diff --git "a/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json" "b/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/Qwen2.5-3B/babylm_hop_control_10M_seed0/runs/checkpoint-1122/trainer_state.json" @@ -0,0 +1,8783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017825311942959, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6857, + "step": 1 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6495, + "step": 2 + }, + { + "epoch": 0.0053475935828877, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6251, + "step": 3 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6453, + "step": 4 + }, + { + "epoch": 0.008912655971479501, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6462, + "step": 5 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6457, + "step": 6 + }, + { + "epoch": 0.012477718360071301, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.677, + "step": 7 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6447, + "step": 8 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6675, + "step": 9 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6563, + "step": 10 + }, + { + "epoch": 0.017825311942959002, + "eval_loss": 1.6535868644714355, + "eval_runtime": 24.5668, + "eval_samples_per_second": 40.705, + "eval_steps_per_second": 2.564, + "step": 10 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6458, + "step": 11 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6598, + "step": 12 + }, + { + "epoch": 0.023172905525846704, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6628, + "step": 13 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6816, + "step": 14 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 1.980492115020752, + "learning_rate": 4.424778761061947e-08, + "loss": 1.6526, + "step": 15 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 1.9155012369155884, + "learning_rate": 8.849557522123894e-08, + "loss": 1.6552, + "step": 16 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 1.8875389099121094, + "learning_rate": 1.327433628318584e-07, + "loss": 1.6684, + "step": 17 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 1.8125866651535034, + "learning_rate": 1.7699115044247788e-07, + "loss": 1.6549, + "step": 18 + }, + { + "epoch": 0.0338680926916221, + "grad_norm": 1.8792786598205566, + "learning_rate": 2.2123893805309737e-07, + "loss": 1.6487, + "step": 19 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 1.973883032798767, + "learning_rate": 2.654867256637168e-07, + "loss": 1.6671, + "step": 20 + }, + { + "epoch": 0.035650623885918005, + "eval_loss": 1.6527129411697388, + "eval_runtime": 24.8773, + "eval_samples_per_second": 40.197, + "eval_steps_per_second": 2.532, + "step": 20 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 1.84323251247406, + "learning_rate": 3.097345132743363e-07, + "loss": 1.6599, + "step": 21 + }, + { + "epoch": 0.0392156862745098, + "grad_norm": 1.8644572496414185, + "learning_rate": 3.5398230088495575e-07, + "loss": 1.6475, + "step": 22 + }, + { + "epoch": 0.040998217468805706, + "grad_norm": 1.860310435295105, + "learning_rate": 3.9823008849557525e-07, + "loss": 1.6656, + "step": 23 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 1.7291864156723022, + "learning_rate": 4.4247787610619474e-07, + "loss": 1.6542, + "step": 24 + }, + { + "epoch": 0.044563279857397504, + "grad_norm": 1.6295312643051147, + "learning_rate": 4.867256637168142e-07, + "loss": 1.661, + "step": 25 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 1.5132776498794556, + "learning_rate": 5.309734513274336e-07, + "loss": 1.6399, + "step": 26 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 1.5132776498794556, + "learning_rate": 5.309734513274336e-07, + "loss": 1.6567, + "step": 27 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 1.483805775642395, + "learning_rate": 5.752212389380532e-07, + "loss": 1.6131, + "step": 28 + }, + { + "epoch": 0.05169340463458111, + "grad_norm": 1.5470143556594849, + "learning_rate": 6.194690265486726e-07, + "loss": 1.6696, + "step": 29 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 1.3218852281570435, + "learning_rate": 6.637168141592922e-07, + "loss": 1.6567, + "step": 30 + }, + { + "epoch": 0.053475935828877004, + "eval_loss": 1.628255844116211, + "eval_runtime": 25.3856, + "eval_samples_per_second": 39.392, + "eval_steps_per_second": 2.482, + "step": 30 + }, + { + "epoch": 0.05525846702317291, + "grad_norm": 1.229565978050232, + "learning_rate": 7.079646017699115e-07, + "loss": 1.6412, + "step": 31 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 1.091683268547058, + "learning_rate": 7.522123893805311e-07, + "loss": 1.6232, + "step": 32 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 1.1046212911605835, + "learning_rate": 7.964601769911505e-07, + "loss": 1.6024, + "step": 33 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 1.0457117557525635, + "learning_rate": 8.4070796460177e-07, + "loss": 1.6285, + "step": 34 + }, + { + "epoch": 0.062388591800356503, + "grad_norm": 1.0139962434768677, + "learning_rate": 8.849557522123895e-07, + "loss": 1.6086, + "step": 35 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 0.9111472964286804, + "learning_rate": 9.292035398230089e-07, + "loss": 1.6246, + "step": 36 + }, + { + "epoch": 0.0659536541889483, + "grad_norm": 0.9822351336479187, + "learning_rate": 9.734513274336284e-07, + "loss": 1.5948, + "step": 37 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 0.9101211428642273, + "learning_rate": 1.017699115044248e-06, + "loss": 1.6035, + "step": 38 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 0.8859177827835083, + "learning_rate": 1.0619469026548673e-06, + "loss": 1.594, + "step": 39 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 0.9900261163711548, + "learning_rate": 1.106194690265487e-06, + "loss": 1.6154, + "step": 40 + }, + { + "epoch": 0.07130124777183601, + "eval_loss": 1.5900208950042725, + "eval_runtime": 25.6788, + "eval_samples_per_second": 38.943, + "eval_steps_per_second": 2.453, + "step": 40 + }, + { + "epoch": 0.07308377896613191, + "grad_norm": 0.8495362401008606, + "learning_rate": 1.1504424778761064e-06, + "loss": 1.606, + "step": 41 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 0.8944734334945679, + "learning_rate": 1.1946902654867258e-06, + "loss": 1.6102, + "step": 42 + }, + { + "epoch": 0.0766488413547237, + "grad_norm": 0.9594064354896545, + "learning_rate": 1.2389380530973452e-06, + "loss": 1.5851, + "step": 43 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.8346364498138428, + "learning_rate": 1.2831858407079647e-06, + "loss": 1.5855, + "step": 44 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 0.8732189536094666, + "learning_rate": 1.3274336283185843e-06, + "loss": 1.5751, + "step": 45 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 1.0120676755905151, + "learning_rate": 1.3716814159292036e-06, + "loss": 1.5736, + "step": 46 + }, + { + "epoch": 0.08377896613190731, + "grad_norm": 0.9650525450706482, + "learning_rate": 1.415929203539823e-06, + "loss": 1.5396, + "step": 47 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 0.9934496879577637, + "learning_rate": 1.4601769911504427e-06, + "loss": 1.5417, + "step": 48 + }, + { + "epoch": 0.0873440285204991, + "grad_norm": 1.378520131111145, + "learning_rate": 1.5044247787610621e-06, + "loss": 1.5554, + "step": 49 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 1.302832841873169, + "learning_rate": 1.5486725663716816e-06, + "loss": 1.5449, + "step": 50 + }, + { + "epoch": 0.08912655971479501, + "eval_loss": 1.5264862775802612, + "eval_runtime": 25.8466, + "eval_samples_per_second": 38.69, + "eval_steps_per_second": 2.437, + "step": 50 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 1.2967264652252197, + "learning_rate": 1.592920353982301e-06, + "loss": 1.4994, + "step": 51 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 1.2239652872085571, + "learning_rate": 1.6371681415929204e-06, + "loss": 1.5091, + "step": 52 + }, + { + "epoch": 0.0944741532976827, + "grad_norm": 1.2619361877441406, + "learning_rate": 1.68141592920354e-06, + "loss": 1.4824, + "step": 53 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 1.3398172855377197, + "learning_rate": 1.7256637168141593e-06, + "loss": 1.4996, + "step": 54 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.4670956134796143, + "learning_rate": 1.769911504424779e-06, + "loss": 1.4814, + "step": 55 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 1.2660086154937744, + "learning_rate": 1.8141592920353984e-06, + "loss": 1.4613, + "step": 56 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 1.2482752799987793, + "learning_rate": 1.8584070796460179e-06, + "loss": 1.4812, + "step": 57 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 1.3673324584960938, + "learning_rate": 1.9026548672566373e-06, + "loss": 1.471, + "step": 58 + }, + { + "epoch": 0.1051693404634581, + "grad_norm": 1.3449939489364624, + "learning_rate": 1.9469026548672567e-06, + "loss": 1.4412, + "step": 59 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 1.447826862335205, + "learning_rate": 1.991150442477876e-06, + "loss": 1.4377, + "step": 60 + }, + { + "epoch": 0.10695187165775401, + "eval_loss": 1.429922103881836, + "eval_runtime": 26.0399, + "eval_samples_per_second": 38.403, + "eval_steps_per_second": 2.419, + "step": 60 + }, + { + "epoch": 0.10873440285204991, + "grad_norm": 1.2212026119232178, + "learning_rate": 2.035398230088496e-06, + "loss": 1.3949, + "step": 61 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 1.1740317344665527, + "learning_rate": 2.079646017699115e-06, + "loss": 1.4268, + "step": 62 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 1.2304081916809082, + "learning_rate": 2.1238938053097345e-06, + "loss": 1.4055, + "step": 63 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 1.1806448698043823, + "learning_rate": 2.1681415929203544e-06, + "loss": 1.4126, + "step": 64 + }, + { + "epoch": 0.11586452762923351, + "grad_norm": 1.227970004081726, + "learning_rate": 2.212389380530974e-06, + "loss": 1.4067, + "step": 65 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 1.094361424446106, + "learning_rate": 2.256637168141593e-06, + "loss": 1.3593, + "step": 66 + }, + { + "epoch": 0.11942959001782531, + "grad_norm": 1.2891759872436523, + "learning_rate": 2.3008849557522127e-06, + "loss": 1.3564, + "step": 67 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 1.0433226823806763, + "learning_rate": 2.345132743362832e-06, + "loss": 1.3763, + "step": 68 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 1.4282106161117554, + "learning_rate": 2.3893805309734516e-06, + "loss": 1.3554, + "step": 69 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 1.091366171836853, + "learning_rate": 2.433628318584071e-06, + "loss": 1.3529, + "step": 70 + }, + { + "epoch": 0.12477718360071301, + "eval_loss": 1.3576408624649048, + "eval_runtime": 26.0076, + "eval_samples_per_second": 38.45, + "eval_steps_per_second": 2.422, + "step": 70 + }, + { + "epoch": 0.1265597147950089, + "grad_norm": 1.1191158294677734, + "learning_rate": 2.4778761061946905e-06, + "loss": 1.3368, + "step": 71 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 1.3687219619750977, + "learning_rate": 2.52212389380531e-06, + "loss": 1.3596, + "step": 72 + }, + { + "epoch": 0.13012477718360071, + "grad_norm": 1.1358015537261963, + "learning_rate": 2.5663716814159294e-06, + "loss": 1.3482, + "step": 73 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 1.4303299188613892, + "learning_rate": 2.6106194690265492e-06, + "loss": 1.3242, + "step": 74 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 1.0334805250167847, + "learning_rate": 2.6548672566371687e-06, + "loss": 1.3609, + "step": 75 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 1.222535490989685, + "learning_rate": 2.6991150442477877e-06, + "loss": 1.3375, + "step": 76 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 1.3004604578018188, + "learning_rate": 2.743362831858407e-06, + "loss": 1.2845, + "step": 77 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 1.045078158378601, + "learning_rate": 2.7876106194690266e-06, + "loss": 1.306, + "step": 78 + }, + { + "epoch": 0.1408199643493761, + "grad_norm": 1.3703179359436035, + "learning_rate": 2.831858407079646e-06, + "loss": 1.3284, + "step": 79 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 1.216047763824463, + "learning_rate": 2.876106194690266e-06, + "loss": 1.2894, + "step": 80 + }, + { + "epoch": 0.14260249554367202, + "eval_loss": 1.3045498132705688, + "eval_runtime": 25.8145, + "eval_samples_per_second": 38.738, + "eval_steps_per_second": 2.44, + "step": 80 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 1.1438696384429932, + "learning_rate": 2.9203539823008853e-06, + "loss": 1.2767, + "step": 81 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 1.375326156616211, + "learning_rate": 2.9646017699115048e-06, + "loss": 1.3037, + "step": 82 + }, + { + "epoch": 0.14795008912655971, + "grad_norm": 1.17705500125885, + "learning_rate": 3.0088495575221242e-06, + "loss": 1.2736, + "step": 83 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 1.1131092309951782, + "learning_rate": 3.0530973451327432e-06, + "loss": 1.2743, + "step": 84 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.2165392637252808, + "learning_rate": 3.097345132743363e-06, + "loss": 1.2922, + "step": 85 + }, + { + "epoch": 0.1532976827094474, + "grad_norm": 1.2263474464416504, + "learning_rate": 3.1415929203539825e-06, + "loss": 1.2208, + "step": 86 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 1.1002565622329712, + "learning_rate": 3.185840707964602e-06, + "loss": 1.2437, + "step": 87 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 1.4416580200195312, + "learning_rate": 3.2300884955752214e-06, + "loss": 1.2713, + "step": 88 + }, + { + "epoch": 0.1586452762923351, + "grad_norm": 1.0780956745147705, + "learning_rate": 3.274336283185841e-06, + "loss": 1.2943, + "step": 89 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 1.2529618740081787, + "learning_rate": 3.3185840707964607e-06, + "loss": 1.2913, + "step": 90 + }, + { + "epoch": 0.16042780748663102, + "eval_loss": 1.2740551233291626, + "eval_runtime": 25.9997, + "eval_samples_per_second": 38.462, + "eval_steps_per_second": 2.423, + "step": 90 + }, + { + "epoch": 0.1622103386809269, + "grad_norm": 1.1674306392669678, + "learning_rate": 3.36283185840708e-06, + "loss": 1.2704, + "step": 91 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 1.5128580331802368, + "learning_rate": 3.407079646017699e-06, + "loss": 1.2821, + "step": 92 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 1.216848373413086, + "learning_rate": 3.4513274336283186e-06, + "loss": 1.2629, + "step": 93 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 1.2294068336486816, + "learning_rate": 3.495575221238938e-06, + "loss": 1.2856, + "step": 94 + }, + { + "epoch": 0.16934046345811052, + "grad_norm": 1.4784265756607056, + "learning_rate": 3.539823008849558e-06, + "loss": 1.273, + "step": 95 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 1.200931191444397, + "learning_rate": 3.5840707964601774e-06, + "loss": 1.2312, + "step": 96 + }, + { + "epoch": 0.17290552584670232, + "grad_norm": 2.0421743392944336, + "learning_rate": 3.628318584070797e-06, + "loss": 1.2427, + "step": 97 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 1.5405610799789429, + "learning_rate": 3.6725663716814163e-06, + "loss": 1.2691, + "step": 98 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 2.1598148345947266, + "learning_rate": 3.7168141592920357e-06, + "loss": 1.2134, + "step": 99 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 1.649290919303894, + "learning_rate": 3.7610619469026547e-06, + "loss": 1.2532, + "step": 100 + }, + { + "epoch": 0.17825311942959002, + "eval_loss": 1.2498174905776978, + "eval_runtime": 25.8255, + "eval_samples_per_second": 38.721, + "eval_steps_per_second": 2.439, + "step": 100 + }, + { + "epoch": 0.1800356506238859, + "grad_norm": 2.003908157348633, + "learning_rate": 3.8053097345132746e-06, + "loss": 1.2372, + "step": 101 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.2919707298278809, + "learning_rate": 3.849557522123894e-06, + "loss": 1.2197, + "step": 102 + }, + { + "epoch": 0.1836007130124777, + "grad_norm": 1.8482425212860107, + "learning_rate": 3.8938053097345135e-06, + "loss": 1.1946, + "step": 103 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 1.3239375352859497, + "learning_rate": 3.938053097345133e-06, + "loss": 1.2045, + "step": 104 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 1.5264768600463867, + "learning_rate": 3.982300884955752e-06, + "loss": 1.2379, + "step": 105 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 1.1609495878219604, + "learning_rate": 4.026548672566372e-06, + "loss": 1.2109, + "step": 106 + }, + { + "epoch": 0.19073083778966132, + "grad_norm": 1.3971660137176514, + "learning_rate": 4.070796460176992e-06, + "loss": 1.2049, + "step": 107 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 1.2334355115890503, + "learning_rate": 4.115044247787611e-06, + "loss": 1.2317, + "step": 108 + }, + { + "epoch": 0.19429590017825313, + "grad_norm": 1.36234450340271, + "learning_rate": 4.15929203539823e-06, + "loss": 1.186, + "step": 109 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.2824509143829346, + "learning_rate": 4.20353982300885e-06, + "loss": 1.1683, + "step": 110 + }, + { + "epoch": 0.19607843137254902, + "eval_loss": 1.2226125001907349, + "eval_runtime": 26.0396, + "eval_samples_per_second": 38.403, + "eval_steps_per_second": 2.419, + "step": 110 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 1.3470525741577148, + "learning_rate": 4.247787610619469e-06, + "loss": 1.1557, + "step": 111 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 1.3658194541931152, + "learning_rate": 4.2920353982300885e-06, + "loss": 1.2355, + "step": 112 + }, + { + "epoch": 0.2014260249554367, + "grad_norm": 1.2262756824493408, + "learning_rate": 4.336283185840709e-06, + "loss": 1.214, + "step": 113 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 1.4536570310592651, + "learning_rate": 4.380530973451328e-06, + "loss": 1.2013, + "step": 114 + }, + { + "epoch": 0.20499108734402852, + "grad_norm": 1.4537997245788574, + "learning_rate": 4.424778761061948e-06, + "loss": 1.1879, + "step": 115 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 1.2539417743682861, + "learning_rate": 4.469026548672566e-06, + "loss": 1.2015, + "step": 116 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 1.296627163887024, + "learning_rate": 4.513274336283186e-06, + "loss": 1.1552, + "step": 117 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 1.5704238414764404, + "learning_rate": 4.557522123893805e-06, + "loss": 1.1468, + "step": 118 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 1.4134966135025024, + "learning_rate": 4.6017699115044254e-06, + "loss": 1.1689, + "step": 119 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 1.182852029800415, + "learning_rate": 4.646017699115045e-06, + "loss": 1.1733, + "step": 120 + }, + { + "epoch": 0.21390374331550802, + "eval_loss": 1.206061840057373, + "eval_runtime": 25.9192, + "eval_samples_per_second": 38.581, + "eval_steps_per_second": 2.431, + "step": 120 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 1.5393521785736084, + "learning_rate": 4.690265486725664e-06, + "loss": 1.1637, + "step": 121 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 1.4800235033035278, + "learning_rate": 4.734513274336284e-06, + "loss": 1.1724, + "step": 122 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 1.1709996461868286, + "learning_rate": 4.778761061946903e-06, + "loss": 1.1309, + "step": 123 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 1.5820499658584595, + "learning_rate": 4.823008849557523e-06, + "loss": 1.165, + "step": 124 + }, + { + "epoch": 0.22281639928698752, + "grad_norm": 1.4861419200897217, + "learning_rate": 4.867256637168142e-06, + "loss": 1.1958, + "step": 125 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 1.5460313558578491, + "learning_rate": 4.9115044247787615e-06, + "loss": 1.1264, + "step": 126 + }, + { + "epoch": 0.22638146167557932, + "grad_norm": 1.377894401550293, + "learning_rate": 4.955752212389381e-06, + "loss": 1.1516, + "step": 127 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 1.5216853618621826, + "learning_rate": 5e-06, + "loss": 1.1729, + "step": 128 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 1.1627072095870972, + "learning_rate": 4.995044598612488e-06, + "loss": 1.143, + "step": 129 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 1.6515153646469116, + "learning_rate": 4.990089197224976e-06, + "loss": 1.1501, + "step": 130 + }, + { + "epoch": 0.23172905525846701, + "eval_loss": 1.1917165517807007, + "eval_runtime": 25.9636, + "eval_samples_per_second": 38.515, + "eval_steps_per_second": 2.426, + "step": 130 + }, + { + "epoch": 0.23351158645276293, + "grad_norm": 1.3303382396697998, + "learning_rate": 4.985133795837464e-06, + "loss": 1.1089, + "step": 131 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 1.8258463144302368, + "learning_rate": 4.980178394449951e-06, + "loss": 1.1301, + "step": 132 + }, + { + "epoch": 0.23707664884135474, + "grad_norm": 1.3351376056671143, + "learning_rate": 4.975222993062438e-06, + "loss": 1.1711, + "step": 133 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 1.6486777067184448, + "learning_rate": 4.970267591674926e-06, + "loss": 1.167, + "step": 134 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 1.3900527954101562, + "learning_rate": 4.965312190287414e-06, + "loss": 1.168, + "step": 135 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 1.983830213546753, + "learning_rate": 4.960356788899901e-06, + "loss": 1.146, + "step": 136 + }, + { + "epoch": 0.24420677361853832, + "grad_norm": 1.6066932678222656, + "learning_rate": 4.955401387512389e-06, + "loss": 1.1859, + "step": 137 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 1.9494677782058716, + "learning_rate": 4.950445986124876e-06, + "loss": 1.1442, + "step": 138 + }, + { + "epoch": 0.24777183600713013, + "grad_norm": 1.7467000484466553, + "learning_rate": 4.945490584737364e-06, + "loss": 1.1342, + "step": 139 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 2.071423292160034, + "learning_rate": 4.9405351833498515e-06, + "loss": 1.1526, + "step": 140 + }, + { + "epoch": 0.24955436720142601, + "eval_loss": 1.1779279708862305, + "eval_runtime": 25.9977, + "eval_samples_per_second": 38.465, + "eval_steps_per_second": 2.423, + "step": 140 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 1.915077805519104, + "learning_rate": 4.935579781962339e-06, + "loss": 1.1236, + "step": 141 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 1.472425103187561, + "learning_rate": 4.930624380574827e-06, + "loss": 1.1368, + "step": 142 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 1.830802083015442, + "learning_rate": 4.925668979187315e-06, + "loss": 1.1103, + "step": 143 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 1.4757088422775269, + "learning_rate": 4.920713577799802e-06, + "loss": 1.1561, + "step": 144 + }, + { + "epoch": 0.25846702317290554, + "grad_norm": 1.6575653553009033, + "learning_rate": 4.915758176412289e-06, + "loss": 1.1341, + "step": 145 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 1.3438557386398315, + "learning_rate": 4.9108027750247775e-06, + "loss": 1.1037, + "step": 146 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 1.680197834968567, + "learning_rate": 4.9058473736372656e-06, + "loss": 1.1173, + "step": 147 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 1.567205548286438, + "learning_rate": 4.900891972249753e-06, + "loss": 1.1036, + "step": 148 + }, + { + "epoch": 0.26559714795008915, + "grad_norm": 1.6614781618118286, + "learning_rate": 4.89593657086224e-06, + "loss": 1.1118, + "step": 149 + }, + { + "epoch": 0.26737967914438504, + "grad_norm": 1.5362111330032349, + "learning_rate": 4.890981169474728e-06, + "loss": 1.1162, + "step": 150 + }, + { + "epoch": 0.26737967914438504, + "eval_loss": 1.1625056266784668, + "eval_runtime": 26.0288, + "eval_samples_per_second": 38.419, + "eval_steps_per_second": 2.42, + "step": 150 + }, + { + "epoch": 0.26916221033868093, + "grad_norm": 1.5577179193496704, + "learning_rate": 4.886025768087215e-06, + "loss": 1.1056, + "step": 151 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 1.5841162204742432, + "learning_rate": 4.881070366699703e-06, + "loss": 1.105, + "step": 152 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.4951591491699219, + "learning_rate": 4.876114965312191e-06, + "loss": 1.1074, + "step": 153 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 1.5688201189041138, + "learning_rate": 4.871159563924679e-06, + "loss": 1.0981, + "step": 154 + }, + { + "epoch": 0.27629233511586454, + "grad_norm": 1.4888694286346436, + "learning_rate": 4.866204162537166e-06, + "loss": 1.1283, + "step": 155 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 1.724908471107483, + "learning_rate": 4.861248761149653e-06, + "loss": 1.1136, + "step": 156 + }, + { + "epoch": 0.2798573975044563, + "grad_norm": 1.3840643167495728, + "learning_rate": 4.8562933597621405e-06, + "loss": 1.112, + "step": 157 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 1.334119200706482, + "learning_rate": 4.8513379583746286e-06, + "loss": 1.1134, + "step": 158 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 1.3652615547180176, + "learning_rate": 4.846382556987117e-06, + "loss": 1.1386, + "step": 159 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 1.440026879310608, + "learning_rate": 4.841427155599604e-06, + "loss": 1.109, + "step": 160 + }, + { + "epoch": 0.28520499108734404, + "eval_loss": 1.1494263410568237, + "eval_runtime": 26.1128, + "eval_samples_per_second": 38.295, + "eval_steps_per_second": 2.413, + "step": 160 + }, + { + "epoch": 0.28698752228163993, + "grad_norm": 1.5376060009002686, + "learning_rate": 4.836471754212091e-06, + "loss": 1.0814, + "step": 161 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 1.6225935220718384, + "learning_rate": 4.831516352824579e-06, + "loss": 1.1231, + "step": 162 + }, + { + "epoch": 0.2905525846702317, + "grad_norm": 1.4330049753189087, + "learning_rate": 4.826560951437067e-06, + "loss": 1.1283, + "step": 163 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 1.3375277519226074, + "learning_rate": 4.8216055500495545e-06, + "loss": 1.1047, + "step": 164 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.572200059890747, + "learning_rate": 4.816650148662042e-06, + "loss": 1.0868, + "step": 165 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 1.292799472808838, + "learning_rate": 4.81169474727453e-06, + "loss": 1.0593, + "step": 166 + }, + { + "epoch": 0.2976827094474153, + "grad_norm": 1.3536690473556519, + "learning_rate": 4.806739345887017e-06, + "loss": 1.1251, + "step": 167 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 1.395337462425232, + "learning_rate": 4.801783944499504e-06, + "loss": 1.0859, + "step": 168 + }, + { + "epoch": 0.30124777183600715, + "grad_norm": 1.4418623447418213, + "learning_rate": 4.7968285431119924e-06, + "loss": 1.0895, + "step": 169 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.2470104694366455, + "learning_rate": 4.7918731417244805e-06, + "loss": 1.0945, + "step": 170 + }, + { + "epoch": 0.30303030303030304, + "eval_loss": 1.1404340267181396, + "eval_runtime": 25.7303, + "eval_samples_per_second": 38.865, + "eval_steps_per_second": 2.448, + "step": 170 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 1.446307897567749, + "learning_rate": 4.786917740336968e-06, + "loss": 1.11, + "step": 171 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 1.2429949045181274, + "learning_rate": 4.781962338949455e-06, + "loss": 1.0945, + "step": 172 + }, + { + "epoch": 0.3083778966131907, + "grad_norm": 1.4229985475540161, + "learning_rate": 4.777006937561943e-06, + "loss": 1.0857, + "step": 173 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 1.2821723222732544, + "learning_rate": 4.77205153617443e-06, + "loss": 1.1064, + "step": 174 + }, + { + "epoch": 0.31194295900178254, + "grad_norm": 1.298244595527649, + "learning_rate": 4.767096134786918e-06, + "loss": 1.0695, + "step": 175 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 1.521951675415039, + "learning_rate": 4.762140733399406e-06, + "loss": 1.1152, + "step": 176 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 1.2742416858673096, + "learning_rate": 4.757185332011893e-06, + "loss": 1.0942, + "step": 177 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 1.5329335927963257, + "learning_rate": 4.752229930624381e-06, + "loss": 1.0916, + "step": 178 + }, + { + "epoch": 0.31907308377896615, + "grad_norm": 1.3044357299804688, + "learning_rate": 4.747274529236869e-06, + "loss": 1.0597, + "step": 179 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 1.341720700263977, + "learning_rate": 4.742319127849356e-06, + "loss": 1.11, + "step": 180 + }, + { + "epoch": 0.32085561497326204, + "eval_loss": 1.1308393478393555, + "eval_runtime": 25.8438, + "eval_samples_per_second": 38.694, + "eval_steps_per_second": 2.438, + "step": 180 + }, + { + "epoch": 0.3226381461675579, + "grad_norm": 1.2242933511734009, + "learning_rate": 4.7373637264618435e-06, + "loss": 1.0626, + "step": 181 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 1.4313316345214844, + "learning_rate": 4.732408325074332e-06, + "loss": 1.0998, + "step": 182 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 1.3618254661560059, + "learning_rate": 4.727452923686819e-06, + "loss": 1.1033, + "step": 183 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 1.5201796293258667, + "learning_rate": 4.722497522299306e-06, + "loss": 1.1185, + "step": 184 + }, + { + "epoch": 0.32976827094474154, + "grad_norm": 1.5577552318572998, + "learning_rate": 4.717542120911794e-06, + "loss": 1.0953, + "step": 185 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 1.3261568546295166, + "learning_rate": 4.712586719524282e-06, + "loss": 1.1042, + "step": 186 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.4353320598602295, + "learning_rate": 4.7076313181367695e-06, + "loss": 1.0968, + "step": 187 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 1.4878215789794922, + "learning_rate": 4.702675916749257e-06, + "loss": 1.1125, + "step": 188 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 1.2873917818069458, + "learning_rate": 4.697720515361745e-06, + "loss": 1.0628, + "step": 189 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 1.4027594327926636, + "learning_rate": 4.692765113974233e-06, + "loss": 1.1129, + "step": 190 + }, + { + "epoch": 0.33868092691622104, + "eval_loss": 1.1256904602050781, + "eval_runtime": 25.7332, + "eval_samples_per_second": 38.86, + "eval_steps_per_second": 2.448, + "step": 190 + }, + { + "epoch": 0.3404634581105169, + "grad_norm": 1.5468748807907104, + "learning_rate": 4.68780971258672e-06, + "loss": 1.0634, + "step": 191 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 1.4828784465789795, + "learning_rate": 4.682854311199207e-06, + "loss": 1.0971, + "step": 192 + }, + { + "epoch": 0.34402852049910876, + "grad_norm": 1.4721124172210693, + "learning_rate": 4.677898909811695e-06, + "loss": 1.0492, + "step": 193 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 1.4724591970443726, + "learning_rate": 4.672943508424183e-06, + "loss": 1.0611, + "step": 194 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 1.3990216255187988, + "learning_rate": 4.667988107036671e-06, + "loss": 1.0506, + "step": 195 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 1.7127360105514526, + "learning_rate": 4.663032705649158e-06, + "loss": 1.0718, + "step": 196 + }, + { + "epoch": 0.3511586452762923, + "grad_norm": 1.5870146751403809, + "learning_rate": 4.658077304261645e-06, + "loss": 1.0626, + "step": 197 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.4044660329818726, + "learning_rate": 4.653121902874133e-06, + "loss": 1.0746, + "step": 198 + }, + { + "epoch": 0.35472370766488415, + "grad_norm": 1.7337034940719604, + "learning_rate": 4.648166501486621e-06, + "loss": 1.0813, + "step": 199 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 1.44723379611969, + "learning_rate": 4.643211100099108e-06, + "loss": 1.0551, + "step": 200 + }, + { + "epoch": 0.35650623885918004, + "eval_loss": 1.1240613460540771, + "eval_runtime": 25.8921, + "eval_samples_per_second": 38.622, + "eval_steps_per_second": 2.433, + "step": 200 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 2.0049657821655273, + "learning_rate": 4.638255698711596e-06, + "loss": 1.091, + "step": 201 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 1.4606605768203735, + "learning_rate": 4.633300297324084e-06, + "loss": 1.096, + "step": 202 + }, + { + "epoch": 0.36185383244206776, + "grad_norm": 1.8128238916397095, + "learning_rate": 4.628344895936571e-06, + "loss": 1.0471, + "step": 203 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.587044358253479, + "learning_rate": 4.6233894945490585e-06, + "loss": 1.065, + "step": 204 + }, + { + "epoch": 0.36541889483065954, + "grad_norm": 1.5733635425567627, + "learning_rate": 4.6184340931615466e-06, + "loss": 1.056, + "step": 205 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 1.5961697101593018, + "learning_rate": 4.613478691774035e-06, + "loss": 1.0592, + "step": 206 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 1.2846550941467285, + "learning_rate": 4.608523290386522e-06, + "loss": 1.0498, + "step": 207 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 1.4773404598236084, + "learning_rate": 4.603567888999009e-06, + "loss": 1.0675, + "step": 208 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 1.4184978008270264, + "learning_rate": 4.598612487611497e-06, + "loss": 1.0922, + "step": 209 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 1.4357984066009521, + "learning_rate": 4.5936570862239844e-06, + "loss": 1.0521, + "step": 210 + }, + { + "epoch": 0.37433155080213903, + "eval_loss": 1.1120160818099976, + "eval_runtime": 25.758, + "eval_samples_per_second": 38.823, + "eval_steps_per_second": 2.446, + "step": 210 + }, + { + "epoch": 0.3761140819964349, + "grad_norm": 1.5778534412384033, + "learning_rate": 4.5887016848364725e-06, + "loss": 1.0603, + "step": 211 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 1.4360918998718262, + "learning_rate": 4.58374628344896e-06, + "loss": 1.0796, + "step": 212 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 1.4689946174621582, + "learning_rate": 4.578790882061447e-06, + "loss": 1.0271, + "step": 213 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 1.4524282217025757, + "learning_rate": 4.573835480673935e-06, + "loss": 1.0676, + "step": 214 + }, + { + "epoch": 0.38324420677361853, + "grad_norm": 1.6127585172653198, + "learning_rate": 4.568880079286422e-06, + "loss": 1.0708, + "step": 215 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 1.4047455787658691, + "learning_rate": 4.5639246778989096e-06, + "loss": 1.0813, + "step": 216 + }, + { + "epoch": 0.3868092691622103, + "grad_norm": 1.584825873374939, + "learning_rate": 4.558969276511398e-06, + "loss": 1.0287, + "step": 217 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 1.5703797340393066, + "learning_rate": 4.554013875123886e-06, + "loss": 1.0687, + "step": 218 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 1.5290143489837646, + "learning_rate": 4.549058473736373e-06, + "loss": 1.0528, + "step": 219 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 1.6070911884307861, + "learning_rate": 4.54410307234886e-06, + "loss": 1.0436, + "step": 220 + }, + { + "epoch": 0.39215686274509803, + "eval_loss": 1.1070743799209595, + "eval_runtime": 25.9372, + "eval_samples_per_second": 38.555, + "eval_steps_per_second": 2.429, + "step": 220 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 1.534654140472412, + "learning_rate": 4.539147670961348e-06, + "loss": 1.0744, + "step": 221 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 1.484359622001648, + "learning_rate": 4.534192269573836e-06, + "loss": 1.0548, + "step": 222 + }, + { + "epoch": 0.39750445632798576, + "grad_norm": 1.5490672588348389, + "learning_rate": 4.529236868186324e-06, + "loss": 1.0438, + "step": 223 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 1.7006093263626099, + "learning_rate": 4.524281466798811e-06, + "loss": 1.0321, + "step": 224 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 1.3219202756881714, + "learning_rate": 4.519326065411299e-06, + "loss": 1.0247, + "step": 225 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 1.5470346212387085, + "learning_rate": 4.514370664023786e-06, + "loss": 1.0203, + "step": 226 + }, + { + "epoch": 0.40463458110516937, + "grad_norm": 1.4391487836837769, + "learning_rate": 4.509415262636274e-06, + "loss": 1.0277, + "step": 227 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.3691346645355225, + "learning_rate": 4.5044598612487615e-06, + "loss": 1.0307, + "step": 228 + }, + { + "epoch": 0.40819964349376114, + "grad_norm": 1.5090476274490356, + "learning_rate": 4.499504459861249e-06, + "loss": 1.06, + "step": 229 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 1.660611629486084, + "learning_rate": 4.494549058473737e-06, + "loss": 1.0729, + "step": 230 + }, + { + "epoch": 0.40998217468805703, + "eval_loss": 1.100784182548523, + "eval_runtime": 26.0384, + "eval_samples_per_second": 38.405, + "eval_steps_per_second": 2.42, + "step": 230 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 1.3682957887649536, + "learning_rate": 4.489593657086224e-06, + "loss": 1.0211, + "step": 231 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 1.5248552560806274, + "learning_rate": 4.484638255698711e-06, + "loss": 1.0649, + "step": 232 + }, + { + "epoch": 0.41532976827094475, + "grad_norm": 1.4119149446487427, + "learning_rate": 4.479682854311199e-06, + "loss": 1.0396, + "step": 233 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 1.4445548057556152, + "learning_rate": 4.4747274529236875e-06, + "loss": 1.0238, + "step": 234 + }, + { + "epoch": 0.41889483065953653, + "grad_norm": 1.6393500566482544, + "learning_rate": 4.469772051536175e-06, + "loss": 1.0405, + "step": 235 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 1.4636632204055786, + "learning_rate": 4.464816650148662e-06, + "loss": 1.0533, + "step": 236 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 1.408185601234436, + "learning_rate": 4.45986124876115e-06, + "loss": 1.0441, + "step": 237 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.5546650886535645, + "learning_rate": 4.454905847373638e-06, + "loss": 1.0259, + "step": 238 + }, + { + "epoch": 0.42602495543672014, + "grad_norm": 1.3773846626281738, + "learning_rate": 4.449950445986125e-06, + "loss": 1.0317, + "step": 239 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 1.6242173910140991, + "learning_rate": 4.444995044598613e-06, + "loss": 1.0572, + "step": 240 + }, + { + "epoch": 0.42780748663101603, + "eval_loss": 1.0950753688812256, + "eval_runtime": 25.98, + "eval_samples_per_second": 38.491, + "eval_steps_per_second": 2.425, + "step": 240 + }, + { + "epoch": 0.4295900178253119, + "grad_norm": 1.510955572128296, + "learning_rate": 4.440039643211101e-06, + "loss": 1.0487, + "step": 241 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 1.5870273113250732, + "learning_rate": 4.435084241823588e-06, + "loss": 1.0126, + "step": 242 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 1.3809922933578491, + "learning_rate": 4.430128840436076e-06, + "loss": 1.0525, + "step": 243 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 1.558537244796753, + "learning_rate": 4.425173439048563e-06, + "loss": 1.0421, + "step": 244 + }, + { + "epoch": 0.43672014260249553, + "grad_norm": 1.4023057222366333, + "learning_rate": 4.420218037661051e-06, + "loss": 1.0897, + "step": 245 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 1.4013497829437256, + "learning_rate": 4.415262636273539e-06, + "loss": 1.0418, + "step": 246 + }, + { + "epoch": 0.44028520499108736, + "grad_norm": 1.672249674797058, + "learning_rate": 4.410307234886026e-06, + "loss": 1.045, + "step": 247 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 1.451650857925415, + "learning_rate": 4.405351833498513e-06, + "loss": 1.0193, + "step": 248 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 1.5913277864456177, + "learning_rate": 4.400396432111001e-06, + "loss": 1.0835, + "step": 249 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 1.5107896327972412, + "learning_rate": 4.395441030723489e-06, + "loss": 1.0468, + "step": 250 + }, + { + "epoch": 0.44563279857397503, + "eval_loss": 1.0920634269714355, + "eval_runtime": 25.9492, + "eval_samples_per_second": 38.537, + "eval_steps_per_second": 2.428, + "step": 250 + }, + { + "epoch": 0.4474153297682709, + "grad_norm": 1.3713324069976807, + "learning_rate": 4.3904856293359765e-06, + "loss": 1.011, + "step": 251 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.429887294769287, + "learning_rate": 4.385530227948464e-06, + "loss": 1.0337, + "step": 252 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 1.4633594751358032, + "learning_rate": 4.380574826560952e-06, + "loss": 1.0564, + "step": 253 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 1.4649094343185425, + "learning_rate": 4.37561942517344e-06, + "loss": 1.0152, + "step": 254 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.3923169374465942, + "learning_rate": 4.370664023785927e-06, + "loss": 1.049, + "step": 255 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 1.3575490713119507, + "learning_rate": 4.365708622398414e-06, + "loss": 1.0206, + "step": 256 + }, + { + "epoch": 0.45811051693404636, + "grad_norm": 1.5358511209487915, + "learning_rate": 4.3607532210109024e-06, + "loss": 1.0049, + "step": 257 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 1.4974987506866455, + "learning_rate": 4.35579781962339e-06, + "loss": 1.0512, + "step": 258 + }, + { + "epoch": 0.46167557932263814, + "grad_norm": 1.3226289749145508, + "learning_rate": 4.350842418235878e-06, + "loss": 1.0149, + "step": 259 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 1.4186517000198364, + "learning_rate": 4.345887016848365e-06, + "loss": 1.0451, + "step": 260 + }, + { + "epoch": 0.46345811051693403, + "eval_loss": 1.0856417417526245, + "eval_runtime": 25.8374, + "eval_samples_per_second": 38.704, + "eval_steps_per_second": 2.438, + "step": 260 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 1.5120983123779297, + "learning_rate": 4.340931615460853e-06, + "loss": 1.0525, + "step": 261 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 1.4796828031539917, + "learning_rate": 4.33597621407334e-06, + "loss": 1.0412, + "step": 262 + }, + { + "epoch": 0.46880570409982175, + "grad_norm": 1.4043370485305786, + "learning_rate": 4.3310208126858276e-06, + "loss": 1.0352, + "step": 263 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.454528570175171, + "learning_rate": 4.326065411298316e-06, + "loss": 1.0432, + "step": 264 + }, + { + "epoch": 0.47237076648841353, + "grad_norm": 1.3608475923538208, + "learning_rate": 4.321110009910804e-06, + "loss": 1.0538, + "step": 265 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 1.4900518655776978, + "learning_rate": 4.316154608523291e-06, + "loss": 1.0163, + "step": 266 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 1.5197362899780273, + "learning_rate": 4.311199207135778e-06, + "loss": 1.0483, + "step": 267 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 1.4927443265914917, + "learning_rate": 4.3062438057482654e-06, + "loss": 1.0171, + "step": 268 + }, + { + "epoch": 0.47950089126559714, + "grad_norm": 1.779068946838379, + "learning_rate": 4.3012884043607535e-06, + "loss": 1.0583, + "step": 269 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 1.5883512496948242, + "learning_rate": 4.296333002973242e-06, + "loss": 1.052, + "step": 270 + }, + { + "epoch": 0.48128342245989303, + "eval_loss": 1.0838474035263062, + "eval_runtime": 26.0374, + "eval_samples_per_second": 38.406, + "eval_steps_per_second": 2.42, + "step": 270 + }, + { + "epoch": 0.483065953654189, + "grad_norm": 1.470141887664795, + "learning_rate": 4.291377601585729e-06, + "loss": 0.9976, + "step": 271 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 1.4511467218399048, + "learning_rate": 4.286422200198216e-06, + "loss": 1.0318, + "step": 272 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 1.396666169166565, + "learning_rate": 4.281466798810704e-06, + "loss": 1.0177, + "step": 273 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 1.4303821325302124, + "learning_rate": 4.276511397423191e-06, + "loss": 1.0385, + "step": 274 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 1.5581824779510498, + "learning_rate": 4.2715559960356795e-06, + "loss": 1.0388, + "step": 275 + }, + { + "epoch": 0.4919786096256685, + "grad_norm": 1.495390772819519, + "learning_rate": 4.266600594648167e-06, + "loss": 1.0154, + "step": 276 + }, + { + "epoch": 0.49376114081996436, + "grad_norm": 1.3809154033660889, + "learning_rate": 4.261645193260655e-06, + "loss": 1.0157, + "step": 277 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 1.3862433433532715, + "learning_rate": 4.256689791873142e-06, + "loss": 1.0113, + "step": 278 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 1.477658748626709, + "learning_rate": 4.251734390485629e-06, + "loss": 1.001, + "step": 279 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 1.4587225914001465, + "learning_rate": 4.246778989098117e-06, + "loss": 1.0357, + "step": 280 + }, + { + "epoch": 0.49910873440285203, + "eval_loss": 1.0841108560562134, + "eval_runtime": 25.711, + "eval_samples_per_second": 38.894, + "eval_steps_per_second": 2.45, + "step": 280 + }, + { + "epoch": 0.5008912655971479, + "grad_norm": 1.6233290433883667, + "learning_rate": 4.2418235877106055e-06, + "loss": 1.0157, + "step": 281 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 1.6783347129821777, + "learning_rate": 4.236868186323093e-06, + "loss": 0.9985, + "step": 282 + }, + { + "epoch": 0.5044563279857398, + "grad_norm": 1.5126416683197021, + "learning_rate": 4.23191278493558e-06, + "loss": 1.0632, + "step": 283 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 1.4921975135803223, + "learning_rate": 4.226957383548068e-06, + "loss": 1.0335, + "step": 284 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 1.457830548286438, + "learning_rate": 4.222001982160555e-06, + "loss": 1.049, + "step": 285 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 1.5393613576889038, + "learning_rate": 4.217046580773043e-06, + "loss": 1.0711, + "step": 286 + }, + { + "epoch": 0.5115864527629234, + "grad_norm": 1.4901129007339478, + "learning_rate": 4.212091179385531e-06, + "loss": 0.9559, + "step": 287 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 1.6378134489059448, + "learning_rate": 4.207135777998018e-06, + "loss": 1.0415, + "step": 288 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 1.7938212156295776, + "learning_rate": 4.202180376610506e-06, + "loss": 1.0231, + "step": 289 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 1.424949288368225, + "learning_rate": 4.197224975222993e-06, + "loss": 1.0148, + "step": 290 + }, + { + "epoch": 0.5169340463458111, + "eval_loss": 1.078644037246704, + "eval_runtime": 25.9288, + "eval_samples_per_second": 38.567, + "eval_steps_per_second": 2.43, + "step": 290 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 1.4257686138153076, + "learning_rate": 4.192269573835481e-06, + "loss": 1.015, + "step": 291 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 1.4987152814865112, + "learning_rate": 4.1873141724479685e-06, + "loss": 1.026, + "step": 292 + }, + { + "epoch": 0.5222816399286988, + "grad_norm": 1.380038857460022, + "learning_rate": 4.1823587710604566e-06, + "loss": 1.032, + "step": 293 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 1.421718955039978, + "learning_rate": 4.177403369672944e-06, + "loss": 0.9999, + "step": 294 + }, + { + "epoch": 0.5258467023172906, + "grad_norm": 1.524937391281128, + "learning_rate": 4.172447968285431e-06, + "loss": 1.0144, + "step": 295 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 1.4932767152786255, + "learning_rate": 4.167492566897919e-06, + "loss": 1.011, + "step": 296 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 1.4470241069793701, + "learning_rate": 4.162537165510407e-06, + "loss": 1.0085, + "step": 297 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 1.337119460105896, + "learning_rate": 4.1575817641228945e-06, + "loss": 1.0414, + "step": 298 + }, + { + "epoch": 0.5329768270944741, + "grad_norm": 1.4251195192337036, + "learning_rate": 4.152626362735382e-06, + "loss": 1.0166, + "step": 299 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.4127418994903564, + "learning_rate": 4.14767096134787e-06, + "loss": 1.0327, + "step": 300 + }, + { + "epoch": 0.5347593582887701, + "eval_loss": 1.0737465620040894, + "eval_runtime": 25.8837, + "eval_samples_per_second": 38.634, + "eval_steps_per_second": 2.434, + "step": 300 + }, + { + "epoch": 0.5365418894830659, + "grad_norm": 1.32978355884552, + "learning_rate": 4.142715559960357e-06, + "loss": 1.0267, + "step": 301 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 1.5129624605178833, + "learning_rate": 4.137760158572845e-06, + "loss": 1.052, + "step": 302 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 1.4814997911453247, + "learning_rate": 4.132804757185332e-06, + "loss": 1.0316, + "step": 303 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 1.494928240776062, + "learning_rate": 4.12784935579782e-06, + "loss": 1.0099, + "step": 304 + }, + { + "epoch": 0.5436720142602496, + "grad_norm": 1.471745491027832, + "learning_rate": 4.122893954410308e-06, + "loss": 1.0483, + "step": 305 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.4592864513397217, + "learning_rate": 4.117938553022795e-06, + "loss": 0.9876, + "step": 306 + }, + { + "epoch": 0.5472370766488414, + "grad_norm": 1.4757778644561768, + "learning_rate": 4.112983151635283e-06, + "loss": 1.0074, + "step": 307 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 1.50422203540802, + "learning_rate": 4.10802775024777e-06, + "loss": 1.0246, + "step": 308 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 1.3828375339508057, + "learning_rate": 4.103072348860258e-06, + "loss": 1.0216, + "step": 309 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 1.385804295539856, + "learning_rate": 4.0981169474727456e-06, + "loss": 1.0072, + "step": 310 + }, + { + "epoch": 0.5525846702317291, + "eval_loss": 1.0722776651382446, + "eval_runtime": 25.7006, + "eval_samples_per_second": 38.91, + "eval_steps_per_second": 2.451, + "step": 310 + }, + { + "epoch": 0.5543672014260249, + "grad_norm": 1.4588433504104614, + "learning_rate": 4.093161546085233e-06, + "loss": 1.0031, + "step": 311 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 1.4075288772583008, + "learning_rate": 4.088206144697721e-06, + "loss": 1.013, + "step": 312 + }, + { + "epoch": 0.5579322638146168, + "grad_norm": 1.495343804359436, + "learning_rate": 4.083250743310209e-06, + "loss": 1.005, + "step": 313 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 1.4959484338760376, + "learning_rate": 4.078295341922696e-06, + "loss": 0.989, + "step": 314 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 1.5026849508285522, + "learning_rate": 4.0733399405351834e-06, + "loss": 0.9741, + "step": 315 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 1.5046217441558838, + "learning_rate": 4.0683845391476715e-06, + "loss": 1.0406, + "step": 316 + }, + { + "epoch": 0.5650623885918004, + "grad_norm": 1.3469237089157104, + "learning_rate": 4.063429137760159e-06, + "loss": 1.0313, + "step": 317 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 1.5199836492538452, + "learning_rate": 4.058473736372647e-06, + "loss": 1.0091, + "step": 318 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 1.3804373741149902, + "learning_rate": 4.053518334985134e-06, + "loss": 1.0049, + "step": 319 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.4723666906356812, + "learning_rate": 4.048562933597622e-06, + "loss": 1.0306, + "step": 320 + }, + { + "epoch": 0.5704099821746881, + "eval_loss": 1.0686110258102417, + "eval_runtime": 25.8098, + "eval_samples_per_second": 38.745, + "eval_steps_per_second": 2.441, + "step": 320 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 1.493898630142212, + "learning_rate": 4.043607532210109e-06, + "loss": 0.9967, + "step": 321 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 1.5348026752471924, + "learning_rate": 4.038652130822597e-06, + "loss": 0.9936, + "step": 322 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 1.5064849853515625, + "learning_rate": 4.033696729435085e-06, + "loss": 0.9789, + "step": 323 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.5011988878250122, + "learning_rate": 4.028741328047572e-06, + "loss": 1.0053, + "step": 324 + }, + { + "epoch": 0.5793226381461676, + "grad_norm": 1.689141035079956, + "learning_rate": 4.02378592666006e-06, + "loss": 0.9868, + "step": 325 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 1.4876413345336914, + "learning_rate": 4.018830525272547e-06, + "loss": 1.0328, + "step": 326 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 1.6670453548431396, + "learning_rate": 4.0138751238850345e-06, + "loss": 1.0149, + "step": 327 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.6840277910232544, + "learning_rate": 4.008919722497523e-06, + "loss": 1.0407, + "step": 328 + }, + { + "epoch": 0.5864527629233511, + "grad_norm": 1.5689325332641602, + "learning_rate": 4.003964321110011e-06, + "loss": 0.9809, + "step": 329 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.5473581552505493, + "learning_rate": 3.999008919722498e-06, + "loss": 0.9578, + "step": 330 + }, + { + "epoch": 0.5882352941176471, + "eval_loss": 1.0670527219772339, + "eval_runtime": 25.7616, + "eval_samples_per_second": 38.818, + "eval_steps_per_second": 2.446, + "step": 330 + }, + { + "epoch": 0.5900178253119429, + "grad_norm": 1.6675747632980347, + "learning_rate": 3.994053518334985e-06, + "loss": 1.0227, + "step": 331 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 1.5776814222335815, + "learning_rate": 3.989098116947473e-06, + "loss": 1.0151, + "step": 332 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 1.6494024991989136, + "learning_rate": 3.9841427155599605e-06, + "loss": 0.9756, + "step": 333 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 1.7824499607086182, + "learning_rate": 3.979187314172449e-06, + "loss": 1.0147, + "step": 334 + }, + { + "epoch": 0.5971479500891266, + "grad_norm": 1.4496989250183105, + "learning_rate": 3.974231912784936e-06, + "loss": 1.0363, + "step": 335 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.5654771327972412, + "learning_rate": 3.969276511397424e-06, + "loss": 1.0148, + "step": 336 + }, + { + "epoch": 0.6007130124777184, + "grad_norm": 1.6020286083221436, + "learning_rate": 3.964321110009911e-06, + "loss": 0.9781, + "step": 337 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 1.539452314376831, + "learning_rate": 3.959365708622398e-06, + "loss": 0.9959, + "step": 338 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 1.436840534210205, + "learning_rate": 3.9544103072348865e-06, + "loss": 1.0092, + "step": 339 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 1.5760927200317383, + "learning_rate": 3.949454905847374e-06, + "loss": 1.0148, + "step": 340 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 1.064415693283081, + "eval_runtime": 25.7611, + "eval_samples_per_second": 38.818, + "eval_steps_per_second": 2.446, + "step": 340 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 1.603229284286499, + "learning_rate": 3.944499504459862e-06, + "loss": 1.0011, + "step": 341 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 1.5881043672561646, + "learning_rate": 3.939544103072349e-06, + "loss": 1.0572, + "step": 342 + }, + { + "epoch": 0.6114081996434938, + "grad_norm": 1.5322010517120361, + "learning_rate": 3.934588701684836e-06, + "loss": 1.0401, + "step": 343 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 1.4483110904693604, + "learning_rate": 3.929633300297324e-06, + "loss": 1.0, + "step": 344 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 1.4452067613601685, + "learning_rate": 3.9246778989098124e-06, + "loss": 0.9962, + "step": 345 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.4547947645187378, + "learning_rate": 3.9197224975223e-06, + "loss": 0.9923, + "step": 346 + }, + { + "epoch": 0.6185383244206774, + "grad_norm": 1.5050512552261353, + "learning_rate": 3.914767096134787e-06, + "loss": 0.9867, + "step": 347 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 1.4761282205581665, + "learning_rate": 3.909811694747275e-06, + "loss": 1.023, + "step": 348 + }, + { + "epoch": 0.6221033868092691, + "grad_norm": 1.5492281913757324, + "learning_rate": 3.904856293359762e-06, + "loss": 1.003, + "step": 349 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.4964330196380615, + "learning_rate": 3.89990089197225e-06, + "loss": 0.981, + "step": 350 + }, + { + "epoch": 0.6238859180035651, + "eval_loss": 1.0592336654663086, + "eval_runtime": 25.7609, + "eval_samples_per_second": 38.819, + "eval_steps_per_second": 2.446, + "step": 350 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 1.416474461555481, + "learning_rate": 3.8949454905847376e-06, + "loss": 0.9831, + "step": 351 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.5389848947525024, + "learning_rate": 3.889990089197226e-06, + "loss": 1.0028, + "step": 352 + }, + { + "epoch": 0.6292335115864528, + "grad_norm": 1.4282304048538208, + "learning_rate": 3.885034687809713e-06, + "loss": 0.9946, + "step": 353 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.5408443212509155, + "learning_rate": 3.8800792864222e-06, + "loss": 0.9994, + "step": 354 + }, + { + "epoch": 0.6327985739750446, + "grad_norm": 1.5303212404251099, + "learning_rate": 3.875123885034688e-06, + "loss": 0.993, + "step": 355 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.50641930103302, + "learning_rate": 3.870168483647176e-06, + "loss": 0.9956, + "step": 356 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.5365883111953735, + "learning_rate": 3.8652130822596635e-06, + "loss": 0.9948, + "step": 357 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.6365007162094116, + "learning_rate": 3.860257680872151e-06, + "loss": 0.9991, + "step": 358 + }, + { + "epoch": 0.6399286987522281, + "grad_norm": 1.6538429260253906, + "learning_rate": 3.855302279484638e-06, + "loss": 1.0, + "step": 359 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 1.5091899633407593, + "learning_rate": 3.850346878097126e-06, + "loss": 1.0018, + "step": 360 + }, + { + "epoch": 0.6417112299465241, + "eval_loss": 1.057335376739502, + "eval_runtime": 25.5877, + "eval_samples_per_second": 39.081, + "eval_steps_per_second": 2.462, + "step": 360 + }, + { + "epoch": 0.64349376114082, + "grad_norm": 1.4903072118759155, + "learning_rate": 3.845391476709614e-06, + "loss": 1.0115, + "step": 361 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.663288950920105, + "learning_rate": 3.8404360753221014e-06, + "loss": 0.9848, + "step": 362 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.6192997694015503, + "learning_rate": 3.835480673934589e-06, + "loss": 1.0114, + "step": 363 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 1.487221360206604, + "learning_rate": 3.830525272547077e-06, + "loss": 0.9734, + "step": 364 + }, + { + "epoch": 0.6506238859180036, + "grad_norm": 1.6606189012527466, + "learning_rate": 3.825569871159564e-06, + "loss": 1.012, + "step": 365 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 1.5607527494430542, + "learning_rate": 3.820614469772052e-06, + "loss": 0.9784, + "step": 366 + }, + { + "epoch": 0.6541889483065954, + "grad_norm": 1.5367454290390015, + "learning_rate": 3.815659068384539e-06, + "loss": 0.9965, + "step": 367 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 1.595745325088501, + "learning_rate": 3.810703666997027e-06, + "loss": 1.0008, + "step": 368 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 1.6348439455032349, + "learning_rate": 3.8057482656095146e-06, + "loss": 1.0033, + "step": 369 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.5682718753814697, + "learning_rate": 3.8007928642220023e-06, + "loss": 0.9946, + "step": 370 + }, + { + "epoch": 0.6595365418894831, + "eval_loss": 1.0564712285995483, + "eval_runtime": 25.6509, + "eval_samples_per_second": 38.985, + "eval_steps_per_second": 2.456, + "step": 370 + }, + { + "epoch": 0.661319073083779, + "grad_norm": 1.650360107421875, + "learning_rate": 3.79583746283449e-06, + "loss": 1.0033, + "step": 371 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 1.5877972841262817, + "learning_rate": 3.7908820614469776e-06, + "loss": 1.0059, + "step": 372 + }, + { + "epoch": 0.6648841354723708, + "grad_norm": 1.4824687242507935, + "learning_rate": 3.7859266600594653e-06, + "loss": 0.9776, + "step": 373 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.5661587715148926, + "learning_rate": 3.7809712586719525e-06, + "loss": 0.9608, + "step": 374 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 1.5237559080123901, + "learning_rate": 3.77601585728444e-06, + "loss": 0.9967, + "step": 375 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 1.42563796043396, + "learning_rate": 3.771060455896928e-06, + "loss": 1.0168, + "step": 376 + }, + { + "epoch": 0.6720142602495544, + "grad_norm": 1.4831016063690186, + "learning_rate": 3.766105054509416e-06, + "loss": 1.0032, + "step": 377 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.4618051052093506, + "learning_rate": 3.761149653121903e-06, + "loss": 0.9715, + "step": 378 + }, + { + "epoch": 0.6755793226381461, + "grad_norm": 1.5617003440856934, + "learning_rate": 3.756194251734391e-06, + "loss": 0.9921, + "step": 379 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 1.4664802551269531, + "learning_rate": 3.7512388503468785e-06, + "loss": 0.9886, + "step": 380 + }, + { + "epoch": 0.6773618538324421, + "eval_loss": 1.0536266565322876, + "eval_runtime": 25.8266, + "eval_samples_per_second": 38.72, + "eval_steps_per_second": 2.439, + "step": 380 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 1.4738689661026, + "learning_rate": 3.7462834489593657e-06, + "loss": 0.9894, + "step": 381 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.4868427515029907, + "learning_rate": 3.741328047571854e-06, + "loss": 0.9855, + "step": 382 + }, + { + "epoch": 0.6827094474153298, + "grad_norm": 1.4021263122558594, + "learning_rate": 3.7363726461843415e-06, + "loss": 1.0022, + "step": 383 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 1.432822585105896, + "learning_rate": 3.7314172447968287e-06, + "loss": 0.923, + "step": 384 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 1.452248215675354, + "learning_rate": 3.7264618434093164e-06, + "loss": 0.9738, + "step": 385 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 1.4211044311523438, + "learning_rate": 3.721506442021804e-06, + "loss": 1.0073, + "step": 386 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 1.4883133172988892, + "learning_rate": 3.716551040634292e-06, + "loss": 0.9823, + "step": 387 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 1.6089807748794556, + "learning_rate": 3.7115956392467794e-06, + "loss": 1.0085, + "step": 388 + }, + { + "epoch": 0.6934046345811051, + "grad_norm": 1.4423185586929321, + "learning_rate": 3.706640237859267e-06, + "loss": 0.9764, + "step": 389 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 1.5661147832870483, + "learning_rate": 3.7016848364717543e-06, + "loss": 0.9908, + "step": 390 + }, + { + "epoch": 0.6951871657754011, + "eval_loss": 1.0515977144241333, + "eval_runtime": 25.7048, + "eval_samples_per_second": 38.903, + "eval_steps_per_second": 2.451, + "step": 390 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 1.5067543983459473, + "learning_rate": 3.696729435084242e-06, + "loss": 0.993, + "step": 391 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 1.484405279159546, + "learning_rate": 3.6917740336967296e-06, + "loss": 0.9635, + "step": 392 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 1.4555736780166626, + "learning_rate": 3.6868186323092177e-06, + "loss": 0.9775, + "step": 393 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 1.4570845365524292, + "learning_rate": 3.681863230921705e-06, + "loss": 1.0116, + "step": 394 + }, + { + "epoch": 0.7040998217468806, + "grad_norm": 1.5175104141235352, + "learning_rate": 3.6769078295341926e-06, + "loss": 0.9856, + "step": 395 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.4439493417739868, + "learning_rate": 3.6719524281466802e-06, + "loss": 0.9591, + "step": 396 + }, + { + "epoch": 0.7076648841354723, + "grad_norm": 1.4264500141143799, + "learning_rate": 3.6669970267591675e-06, + "loss": 0.9894, + "step": 397 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 1.4578537940979004, + "learning_rate": 3.6620416253716556e-06, + "loss": 0.9867, + "step": 398 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 1.436010479927063, + "learning_rate": 3.6570862239841432e-06, + "loss": 1.0047, + "step": 399 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.446648359298706, + "learning_rate": 3.6521308225966305e-06, + "loss": 1.0164, + "step": 400 + }, + { + "epoch": 0.7130124777183601, + "eval_loss": 1.050747275352478, + "eval_runtime": 25.6718, + "eval_samples_per_second": 38.953, + "eval_steps_per_second": 2.454, + "step": 400 + }, + { + "epoch": 0.714795008912656, + "grad_norm": 1.4584800004959106, + "learning_rate": 3.647175421209118e-06, + "loss": 0.9755, + "step": 401 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 1.444089412689209, + "learning_rate": 3.6422200198216058e-06, + "loss": 0.9861, + "step": 402 + }, + { + "epoch": 0.7183600713012478, + "grad_norm": 1.522937297821045, + "learning_rate": 3.637264618434094e-06, + "loss": 0.9904, + "step": 403 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.4142314195632935, + "learning_rate": 3.632309217046581e-06, + "loss": 0.9715, + "step": 404 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 1.4843522310256958, + "learning_rate": 3.6273538156590688e-06, + "loss": 0.9878, + "step": 405 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.4929953813552856, + "learning_rate": 3.6223984142715564e-06, + "loss": 1.0019, + "step": 406 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 1.4887545108795166, + "learning_rate": 3.6174430128840437e-06, + "loss": 1.0063, + "step": 407 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.4991189241409302, + "learning_rate": 3.6124876114965313e-06, + "loss": 0.9987, + "step": 408 + }, + { + "epoch": 0.7290552584670231, + "grad_norm": 1.4522265195846558, + "learning_rate": 3.6075322101090194e-06, + "loss": 0.961, + "step": 409 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 1.473073959350586, + "learning_rate": 3.6025768087215067e-06, + "loss": 0.9625, + "step": 410 + }, + { + "epoch": 0.7308377896613191, + "eval_loss": 1.0513505935668945, + "eval_runtime": 25.7751, + "eval_samples_per_second": 38.797, + "eval_steps_per_second": 2.444, + "step": 410 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 1.7087706327438354, + "learning_rate": 3.5976214073339943e-06, + "loss": 1.0139, + "step": 411 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.4385508298873901, + "learning_rate": 3.592666005946482e-06, + "loss": 0.9584, + "step": 412 + }, + { + "epoch": 0.7361853832442068, + "grad_norm": 1.525636911392212, + "learning_rate": 3.5877106045589692e-06, + "loss": 1.0031, + "step": 413 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.487068772315979, + "learning_rate": 3.5827552031714573e-06, + "loss": 1.0173, + "step": 414 + }, + { + "epoch": 0.7397504456327986, + "grad_norm": 1.4463266134262085, + "learning_rate": 3.577799801783945e-06, + "loss": 0.9762, + "step": 415 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.5234814882278442, + "learning_rate": 3.5728444003964326e-06, + "loss": 0.9776, + "step": 416 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 1.4824904203414917, + "learning_rate": 3.56788899900892e-06, + "loss": 0.9981, + "step": 417 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 1.51459538936615, + "learning_rate": 3.5629335976214075e-06, + "loss": 0.9542, + "step": 418 + }, + { + "epoch": 0.7468805704099821, + "grad_norm": 1.5176249742507935, + "learning_rate": 3.5579781962338956e-06, + "loss": 0.9655, + "step": 419 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.840696930885315, + "learning_rate": 3.553022794846383e-06, + "loss": 0.9948, + "step": 420 + }, + { + "epoch": 0.7486631016042781, + "eval_loss": 1.0480048656463623, + "eval_runtime": 26.0675, + "eval_samples_per_second": 38.362, + "eval_steps_per_second": 2.417, + "step": 420 + }, + { + "epoch": 0.750445632798574, + "grad_norm": 1.5904754400253296, + "learning_rate": 3.5480673934588705e-06, + "loss": 0.9577, + "step": 421 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.5453381538391113, + "learning_rate": 3.543111992071358e-06, + "loss": 0.9968, + "step": 422 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 1.5720494985580444, + "learning_rate": 3.5381565906838454e-06, + "loss": 0.963, + "step": 423 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 1.4631563425064087, + "learning_rate": 3.533201189296333e-06, + "loss": 0.9987, + "step": 424 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 1.5536762475967407, + "learning_rate": 3.528245787908821e-06, + "loss": 0.9579, + "step": 425 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 1.602889895439148, + "learning_rate": 3.523290386521309e-06, + "loss": 1.0036, + "step": 426 + }, + { + "epoch": 0.7611408199643493, + "grad_norm": 1.6541874408721924, + "learning_rate": 3.518334985133796e-06, + "loss": 0.9689, + "step": 427 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 1.529970407485962, + "learning_rate": 3.5133795837462837e-06, + "loss": 0.9577, + "step": 428 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 1.5926837921142578, + "learning_rate": 3.508424182358771e-06, + "loss": 0.9908, + "step": 429 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.5412956476211548, + "learning_rate": 3.503468780971259e-06, + "loss": 0.9699, + "step": 430 + }, + { + "epoch": 0.7664884135472371, + "eval_loss": 1.0452969074249268, + "eval_runtime": 25.7018, + "eval_samples_per_second": 38.908, + "eval_steps_per_second": 2.451, + "step": 430 + }, + { + "epoch": 0.768270944741533, + "grad_norm": 1.5507601499557495, + "learning_rate": 3.4985133795837467e-06, + "loss": 1.0164, + "step": 431 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 1.5753085613250732, + "learning_rate": 3.4935579781962344e-06, + "loss": 0.9694, + "step": 432 + }, + { + "epoch": 0.7718360071301248, + "grad_norm": 1.532804250717163, + "learning_rate": 3.4886025768087216e-06, + "loss": 0.9837, + "step": 433 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.7083972692489624, + "learning_rate": 3.4836471754212093e-06, + "loss": 1.0071, + "step": 434 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 1.5414382219314575, + "learning_rate": 3.4786917740336974e-06, + "loss": 0.9755, + "step": 435 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.4537928104400635, + "learning_rate": 3.4737363726461846e-06, + "loss": 0.9659, + "step": 436 + }, + { + "epoch": 0.7789661319073083, + "grad_norm": 1.486669898033142, + "learning_rate": 3.4687809712586723e-06, + "loss": 0.9741, + "step": 437 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.5523810386657715, + "learning_rate": 3.46382556987116e-06, + "loss": 0.9857, + "step": 438 + }, + { + "epoch": 0.7825311942959001, + "grad_norm": 1.5614720582962036, + "learning_rate": 3.458870168483647e-06, + "loss": 1.0088, + "step": 439 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.621520757675171, + "learning_rate": 3.453914767096135e-06, + "loss": 1.0051, + "step": 440 + }, + { + "epoch": 0.7843137254901961, + "eval_loss": 1.04201078414917, + "eval_runtime": 25.8799, + "eval_samples_per_second": 38.64, + "eval_steps_per_second": 2.434, + "step": 440 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 1.5269745588302612, + "learning_rate": 3.448959365708623e-06, + "loss": 0.9874, + "step": 441 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.5022350549697876, + "learning_rate": 3.4440039643211106e-06, + "loss": 0.9661, + "step": 442 + }, + { + "epoch": 0.7896613190730838, + "grad_norm": 1.476470947265625, + "learning_rate": 3.439048562933598e-06, + "loss": 0.9626, + "step": 443 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 1.6092950105667114, + "learning_rate": 3.4340931615460855e-06, + "loss": 0.9828, + "step": 444 + }, + { + "epoch": 0.7932263814616756, + "grad_norm": 1.5567653179168701, + "learning_rate": 3.4291377601585727e-06, + "loss": 0.994, + "step": 445 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 1.4823105335235596, + "learning_rate": 3.424182358771061e-06, + "loss": 0.9619, + "step": 446 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 1.7685904502868652, + "learning_rate": 3.4192269573835485e-06, + "loss": 0.9475, + "step": 447 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.5113608837127686, + "learning_rate": 3.414271555996036e-06, + "loss": 0.9868, + "step": 448 + }, + { + "epoch": 0.8003565062388592, + "grad_norm": 1.6278724670410156, + "learning_rate": 3.4093161546085234e-06, + "loss": 0.9729, + "step": 449 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.922420859336853, + "learning_rate": 3.404360753221011e-06, + "loss": 1.0143, + "step": 450 + }, + { + "epoch": 0.8021390374331551, + "eval_loss": 1.0419155359268188, + "eval_runtime": 25.9263, + "eval_samples_per_second": 38.571, + "eval_steps_per_second": 2.43, + "step": 450 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 1.523510456085205, + "learning_rate": 3.399405351833499e-06, + "loss": 0.9925, + "step": 451 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 1.5074394941329956, + "learning_rate": 3.3944499504459868e-06, + "loss": 0.9788, + "step": 452 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 1.5418741703033447, + "learning_rate": 3.389494549058474e-06, + "loss": 0.9669, + "step": 453 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 1.5402334928512573, + "learning_rate": 3.3845391476709617e-06, + "loss": 0.995, + "step": 454 + }, + { + "epoch": 0.8110516934046346, + "grad_norm": 1.4736061096191406, + "learning_rate": 3.379583746283449e-06, + "loss": 0.968, + "step": 455 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 1.7145874500274658, + "learning_rate": 3.3746283448959366e-06, + "loss": 0.9594, + "step": 456 + }, + { + "epoch": 0.8146167557932263, + "grad_norm": 1.7427315711975098, + "learning_rate": 3.3696729435084247e-06, + "loss": 0.9777, + "step": 457 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.5081549882888794, + "learning_rate": 3.3647175421209123e-06, + "loss": 0.9572, + "step": 458 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.608188271522522, + "learning_rate": 3.3597621407333996e-06, + "loss": 0.9906, + "step": 459 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 1.5751535892486572, + "learning_rate": 3.3548067393458872e-06, + "loss": 0.966, + "step": 460 + }, + { + "epoch": 0.8199643493761141, + "eval_loss": 1.0394279956817627, + "eval_runtime": 25.7927, + "eval_samples_per_second": 38.771, + "eval_steps_per_second": 2.443, + "step": 460 + }, + { + "epoch": 0.82174688057041, + "grad_norm": 1.4779298305511475, + "learning_rate": 3.349851337958375e-06, + "loss": 0.9953, + "step": 461 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.4367811679840088, + "learning_rate": 3.344895936570863e-06, + "loss": 0.9793, + "step": 462 + }, + { + "epoch": 0.8253119429590018, + "grad_norm": 1.6063673496246338, + "learning_rate": 3.33994053518335e-06, + "loss": 0.9619, + "step": 463 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.5322190523147583, + "learning_rate": 3.334985133795838e-06, + "loss": 0.9636, + "step": 464 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 1.6693592071533203, + "learning_rate": 3.330029732408325e-06, + "loss": 1.014, + "step": 465 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 1.6570430994033813, + "learning_rate": 3.3250743310208128e-06, + "loss": 0.9913, + "step": 466 + }, + { + "epoch": 0.8324420677361853, + "grad_norm": 1.5995551347732544, + "learning_rate": 3.3201189296333004e-06, + "loss": 0.9514, + "step": 467 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.528596043586731, + "learning_rate": 3.3151635282457885e-06, + "loss": 0.9817, + "step": 468 + }, + { + "epoch": 0.8360071301247772, + "grad_norm": 1.475396752357483, + "learning_rate": 3.3102081268582757e-06, + "loss": 0.9604, + "step": 469 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 1.5124799013137817, + "learning_rate": 3.3052527254707634e-06, + "loss": 0.9596, + "step": 470 + }, + { + "epoch": 0.8377896613190731, + "eval_loss": 1.0366942882537842, + "eval_runtime": 25.742, + "eval_samples_per_second": 38.847, + "eval_steps_per_second": 2.447, + "step": 470 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 1.5007457733154297, + "learning_rate": 3.300297324083251e-06, + "loss": 0.9647, + "step": 471 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 1.4299620389938354, + "learning_rate": 3.2953419226957383e-06, + "loss": 0.9591, + "step": 472 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 1.4947385787963867, + "learning_rate": 3.2903865213082264e-06, + "loss": 0.971, + "step": 473 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.5945279598236084, + "learning_rate": 3.285431119920714e-06, + "loss": 1.0029, + "step": 474 + }, + { + "epoch": 0.8467023172905526, + "grad_norm": 1.6020610332489014, + "learning_rate": 3.2804757185332013e-06, + "loss": 0.9834, + "step": 475 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.6295793056488037, + "learning_rate": 3.275520317145689e-06, + "loss": 0.9778, + "step": 476 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 1.4652864933013916, + "learning_rate": 3.2705649157581766e-06, + "loss": 1.0236, + "step": 477 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 1.417965054512024, + "learning_rate": 3.2656095143706647e-06, + "loss": 0.9724, + "step": 478 + }, + { + "epoch": 0.8538324420677362, + "grad_norm": 1.449277400970459, + "learning_rate": 3.260654112983152e-06, + "loss": 0.9896, + "step": 479 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 1.4030529260635376, + "learning_rate": 3.2556987115956396e-06, + "loss": 0.9882, + "step": 480 + }, + { + "epoch": 0.8556149732620321, + "eval_loss": 1.0339490175247192, + "eval_runtime": 25.8189, + "eval_samples_per_second": 38.731, + "eval_steps_per_second": 2.44, + "step": 480 + }, + { + "epoch": 0.857397504456328, + "grad_norm": 1.5231982469558716, + "learning_rate": 3.2507433102081273e-06, + "loss": 0.964, + "step": 481 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 1.5029219388961792, + "learning_rate": 3.2457879088206145e-06, + "loss": 0.9774, + "step": 482 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 1.5686631202697754, + "learning_rate": 3.240832507433102e-06, + "loss": 0.959, + "step": 483 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 1.5515562295913696, + "learning_rate": 3.2358771060455903e-06, + "loss": 0.9367, + "step": 484 + }, + { + "epoch": 0.8645276292335116, + "grad_norm": 1.4003010988235474, + "learning_rate": 3.2309217046580775e-06, + "loss": 0.9754, + "step": 485 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.4356753826141357, + "learning_rate": 3.225966303270565e-06, + "loss": 0.9673, + "step": 486 + }, + { + "epoch": 0.8680926916221033, + "grad_norm": 1.5588560104370117, + "learning_rate": 3.221010901883053e-06, + "loss": 0.9821, + "step": 487 + }, + { + "epoch": 0.8698752228163993, + "grad_norm": 1.494897723197937, + "learning_rate": 3.21605550049554e-06, + "loss": 0.9595, + "step": 488 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 1.580748200416565, + "learning_rate": 3.211100099108028e-06, + "loss": 0.9258, + "step": 489 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.4712975025177002, + "learning_rate": 3.206144697720516e-06, + "loss": 0.9657, + "step": 490 + }, + { + "epoch": 0.8734402852049911, + "eval_loss": 1.0344802141189575, + "eval_runtime": 25.8547, + "eval_samples_per_second": 38.678, + "eval_steps_per_second": 2.437, + "step": 490 + }, + { + "epoch": 0.875222816399287, + "grad_norm": 1.4707386493682861, + "learning_rate": 3.201189296333003e-06, + "loss": 0.9597, + "step": 491 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 1.5678167343139648, + "learning_rate": 3.1962338949454907e-06, + "loss": 0.9745, + "step": 492 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 1.577392578125, + "learning_rate": 3.1912784935579784e-06, + "loss": 0.9644, + "step": 493 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 1.4401471614837646, + "learning_rate": 3.1863230921704664e-06, + "loss": 0.9563, + "step": 494 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.5576186180114746, + "learning_rate": 3.1813676907829537e-06, + "loss": 0.9665, + "step": 495 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.5939122438430786, + "learning_rate": 3.1764122893954413e-06, + "loss": 0.9746, + "step": 496 + }, + { + "epoch": 0.8859180035650623, + "grad_norm": 1.476230263710022, + "learning_rate": 3.171456888007929e-06, + "loss": 0.9479, + "step": 497 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.5572577714920044, + "learning_rate": 3.1665014866204162e-06, + "loss": 0.9643, + "step": 498 + }, + { + "epoch": 0.8894830659536542, + "grad_norm": 1.5189151763916016, + "learning_rate": 3.161546085232904e-06, + "loss": 0.9257, + "step": 499 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.5143529176712036, + "learning_rate": 3.156590683845392e-06, + "loss": 0.9706, + "step": 500 + }, + { + "epoch": 0.8912655971479501, + "eval_loss": 1.0346344709396362, + "eval_runtime": 26.07, + "eval_samples_per_second": 38.358, + "eval_steps_per_second": 2.417, + "step": 500 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 1.5261365175247192, + "learning_rate": 3.1516352824578792e-06, + "loss": 0.9785, + "step": 501 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.5646567344665527, + "learning_rate": 3.146679881070367e-06, + "loss": 0.992, + "step": 502 + }, + { + "epoch": 0.8966131907308378, + "grad_norm": 1.5119503736495972, + "learning_rate": 3.1417244796828546e-06, + "loss": 0.9488, + "step": 503 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.5541421175003052, + "learning_rate": 3.136769078295342e-06, + "loss": 0.9617, + "step": 504 + }, + { + "epoch": 0.9001782531194296, + "grad_norm": 1.491257667541504, + "learning_rate": 3.13181367690783e-06, + "loss": 0.981, + "step": 505 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.6135386228561401, + "learning_rate": 3.1268582755203175e-06, + "loss": 0.9874, + "step": 506 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 1.531295895576477, + "learning_rate": 3.121902874132805e-06, + "loss": 0.9816, + "step": 507 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 1.5324310064315796, + "learning_rate": 3.1169474727452924e-06, + "loss": 0.9189, + "step": 508 + }, + { + "epoch": 0.9073083778966132, + "grad_norm": 1.8411617279052734, + "learning_rate": 3.11199207135778e-06, + "loss": 0.9744, + "step": 509 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.6758506298065186, + "learning_rate": 3.107036669970268e-06, + "loss": 0.9774, + "step": 510 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 1.033508539199829, + "eval_runtime": 25.9173, + "eval_samples_per_second": 38.584, + "eval_steps_per_second": 2.431, + "step": 510 + }, + { + "epoch": 0.910873440285205, + "grad_norm": 1.6758506298065186, + "learning_rate": 3.107036669970268e-06, + "loss": 0.9609, + "step": 511 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 1.6164848804473877, + "learning_rate": 3.1020812685827554e-06, + "loss": 0.9842, + "step": 512 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 1.5550708770751953, + "learning_rate": 3.097125867195243e-06, + "loss": 0.9692, + "step": 513 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 1.5243220329284668, + "learning_rate": 3.0921704658077308e-06, + "loss": 0.9618, + "step": 514 + }, + { + "epoch": 0.9180035650623886, + "grad_norm": 1.660752773284912, + "learning_rate": 3.087215064420218e-06, + "loss": 0.9727, + "step": 515 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 1.559834361076355, + "learning_rate": 3.0822596630327057e-06, + "loss": 0.9733, + "step": 516 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 1.4715766906738281, + "learning_rate": 3.0773042616451937e-06, + "loss": 0.9338, + "step": 517 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.4563944339752197, + "learning_rate": 3.0723488602576814e-06, + "loss": 0.9204, + "step": 518 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 1.613408088684082, + "learning_rate": 3.0673934588701686e-06, + "loss": 0.975, + "step": 519 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 1.436843752861023, + "learning_rate": 3.0624380574826563e-06, + "loss": 0.9464, + "step": 520 + }, + { + "epoch": 0.9269162210338681, + "eval_loss": 1.031677484512329, + "eval_runtime": 25.8601, + "eval_samples_per_second": 38.67, + "eval_steps_per_second": 2.436, + "step": 520 + }, + { + "epoch": 0.928698752228164, + "grad_norm": 1.4962098598480225, + "learning_rate": 3.0574826560951435e-06, + "loss": 0.9756, + "step": 521 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 1.5167757272720337, + "learning_rate": 3.0525272547076316e-06, + "loss": 0.9565, + "step": 522 + }, + { + "epoch": 0.9322638146167558, + "grad_norm": 1.5992977619171143, + "learning_rate": 3.0475718533201193e-06, + "loss": 0.9752, + "step": 523 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.521874189376831, + "learning_rate": 3.042616451932607e-06, + "loss": 0.9424, + "step": 524 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 1.5859359502792358, + "learning_rate": 3.037661050545094e-06, + "loss": 0.966, + "step": 525 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 1.4846385717391968, + "learning_rate": 3.032705649157582e-06, + "loss": 0.97, + "step": 526 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 1.4894582033157349, + "learning_rate": 3.02775024777007e-06, + "loss": 0.9651, + "step": 527 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.538907766342163, + "learning_rate": 3.022794846382557e-06, + "loss": 0.9439, + "step": 528 + }, + { + "epoch": 0.9429590017825312, + "grad_norm": 1.5071743726730347, + "learning_rate": 3.017839444995045e-06, + "loss": 0.9667, + "step": 529 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.4718732833862305, + "learning_rate": 3.0128840436075325e-06, + "loss": 0.9553, + "step": 530 + }, + { + "epoch": 0.9447415329768271, + "eval_loss": 1.027829885482788, + "eval_runtime": 25.8768, + "eval_samples_per_second": 38.645, + "eval_steps_per_second": 2.435, + "step": 530 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 1.5042873620986938, + "learning_rate": 3.0079286422200197e-06, + "loss": 0.9772, + "step": 531 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.4991027116775513, + "learning_rate": 3.0029732408325074e-06, + "loss": 0.9549, + "step": 532 + }, + { + "epoch": 0.9500891265597148, + "grad_norm": 1.4758248329162598, + "learning_rate": 2.9980178394449955e-06, + "loss": 0.9659, + "step": 533 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.4332822561264038, + "learning_rate": 2.993062438057483e-06, + "loss": 0.9405, + "step": 534 + }, + { + "epoch": 0.9536541889483066, + "grad_norm": 1.4461305141448975, + "learning_rate": 2.9881070366699704e-06, + "loss": 0.9822, + "step": 535 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 1.5441949367523193, + "learning_rate": 2.983151635282458e-06, + "loss": 0.9979, + "step": 536 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 1.5105853080749512, + "learning_rate": 2.9781962338949457e-06, + "loss": 0.9608, + "step": 537 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.5477588176727295, + "learning_rate": 2.9732408325074334e-06, + "loss": 0.9343, + "step": 538 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 1.4892299175262451, + "learning_rate": 2.968285431119921e-06, + "loss": 0.95, + "step": 539 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.5129679441452026, + "learning_rate": 2.9633300297324087e-06, + "loss": 0.9194, + "step": 540 + }, + { + "epoch": 0.9625668449197861, + "eval_loss": 1.0259133577346802, + "eval_runtime": 26.1016, + "eval_samples_per_second": 38.312, + "eval_steps_per_second": 2.414, + "step": 540 + }, + { + "epoch": 0.964349376114082, + "grad_norm": 1.589005947113037, + "learning_rate": 2.958374628344896e-06, + "loss": 0.992, + "step": 541 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 1.4980206489562988, + "learning_rate": 2.9534192269573836e-06, + "loss": 0.938, + "step": 542 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 1.4804502725601196, + "learning_rate": 2.9484638255698717e-06, + "loss": 0.9629, + "step": 543 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.4890419244766235, + "learning_rate": 2.9435084241823593e-06, + "loss": 0.9633, + "step": 544 + }, + { + "epoch": 0.9714795008912656, + "grad_norm": 1.4954791069030762, + "learning_rate": 2.9385530227948466e-06, + "loss": 0.9718, + "step": 545 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 1.5530617237091064, + "learning_rate": 2.9335976214073342e-06, + "loss": 0.972, + "step": 546 + }, + { + "epoch": 0.9750445632798574, + "grad_norm": 1.5793863534927368, + "learning_rate": 2.9286422200198215e-06, + "loss": 0.9656, + "step": 547 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 1.5591521263122559, + "learning_rate": 2.923686818632309e-06, + "loss": 0.9445, + "step": 548 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 1.7068123817443848, + "learning_rate": 2.9187314172447972e-06, + "loss": 0.9979, + "step": 549 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 1.5867973566055298, + "learning_rate": 2.913776015857285e-06, + "loss": 0.9755, + "step": 550 + }, + { + "epoch": 0.9803921568627451, + "eval_loss": 1.0254909992218018, + "eval_runtime": 25.9627, + "eval_samples_per_second": 38.517, + "eval_steps_per_second": 2.427, + "step": 550 + }, + { + "epoch": 0.982174688057041, + "grad_norm": 1.5121583938598633, + "learning_rate": 2.908820614469772e-06, + "loss": 0.9659, + "step": 551 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 1.4990359544754028, + "learning_rate": 2.9038652130822598e-06, + "loss": 0.9898, + "step": 552 + }, + { + "epoch": 0.9857397504456328, + "grad_norm": 1.6708561182022095, + "learning_rate": 2.8989098116947474e-06, + "loss": 0.9674, + "step": 553 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.5765035152435303, + "learning_rate": 2.8939544103072355e-06, + "loss": 0.9363, + "step": 554 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 1.8125452995300293, + "learning_rate": 2.8889990089197228e-06, + "loss": 0.9785, + "step": 555 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.666881799697876, + "learning_rate": 2.8840436075322104e-06, + "loss": 0.9514, + "step": 556 + }, + { + "epoch": 0.9928698752228164, + "grad_norm": 1.5965781211853027, + "learning_rate": 2.8790882061446977e-06, + "loss": 0.9818, + "step": 557 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.6427617073059082, + "learning_rate": 2.8741328047571853e-06, + "loss": 0.9308, + "step": 558 + }, + { + "epoch": 0.9964349376114082, + "grad_norm": 1.5469664335250854, + "learning_rate": 2.8691774033696734e-06, + "loss": 1.0039, + "step": 559 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.4527232646942139, + "learning_rate": 2.864222001982161e-06, + "loss": 0.9536, + "step": 560 + }, + { + "epoch": 0.9982174688057041, + "eval_loss": 1.0249741077423096, + "eval_runtime": 25.9621, + "eval_samples_per_second": 38.518, + "eval_steps_per_second": 2.427, + "step": 560 + }, + { + "epoch": 1.0, + "grad_norm": 1.638670802116394, + "learning_rate": 2.8592666005946483e-06, + "loss": 0.945, + "step": 561 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 1.7704370021820068, + "learning_rate": 2.854311199207136e-06, + "loss": 0.9291, + "step": 562 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 1.534660816192627, + "learning_rate": 2.8493557978196236e-06, + "loss": 0.9013, + "step": 563 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 1.504355549812317, + "learning_rate": 2.844400396432111e-06, + "loss": 0.924, + "step": 564 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 1.7646287679672241, + "learning_rate": 2.839444995044599e-06, + "loss": 0.954, + "step": 565 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 1.835209846496582, + "learning_rate": 2.8344895936570866e-06, + "loss": 0.9199, + "step": 566 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 1.6713684797286987, + "learning_rate": 2.829534192269574e-06, + "loss": 0.9314, + "step": 567 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 1.545698642730713, + "learning_rate": 2.8245787908820615e-06, + "loss": 0.9741, + "step": 568 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 1.5221408605575562, + "learning_rate": 2.819623389494549e-06, + "loss": 0.9325, + "step": 569 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 1.5692074298858643, + "learning_rate": 2.8146679881070373e-06, + "loss": 0.9593, + "step": 570 + }, + { + "epoch": 1.0160427807486632, + "eval_loss": 1.0230886936187744, + "eval_runtime": 25.9561, + "eval_samples_per_second": 38.527, + "eval_steps_per_second": 2.427, + "step": 570 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 1.6565548181533813, + "learning_rate": 2.8097125867195245e-06, + "loss": 0.9367, + "step": 571 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 1.5202622413635254, + "learning_rate": 2.804757185332012e-06, + "loss": 0.9421, + "step": 572 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 1.6204488277435303, + "learning_rate": 2.7998017839445e-06, + "loss": 0.9327, + "step": 573 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 1.7548097372055054, + "learning_rate": 2.794846382556987e-06, + "loss": 0.9696, + "step": 574 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 1.6551501750946045, + "learning_rate": 2.789890981169475e-06, + "loss": 0.9255, + "step": 575 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 1.6978447437286377, + "learning_rate": 2.784935579781963e-06, + "loss": 0.9724, + "step": 576 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 1.6319726705551147, + "learning_rate": 2.77998017839445e-06, + "loss": 0.9499, + "step": 577 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 1.803403377532959, + "learning_rate": 2.7750247770069377e-06, + "loss": 0.876, + "step": 578 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 1.7570103406906128, + "learning_rate": 2.7700693756194254e-06, + "loss": 0.9374, + "step": 579 + }, + { + "epoch": 1.0338680926916222, + "grad_norm": 1.6463748216629028, + "learning_rate": 2.7651139742319126e-06, + "loss": 0.9224, + "step": 580 + }, + { + "epoch": 1.0338680926916222, + "eval_loss": 1.026388168334961, + "eval_runtime": 25.8018, + "eval_samples_per_second": 38.757, + "eval_steps_per_second": 2.442, + "step": 580 + }, + { + "epoch": 1.035650623885918, + "grad_norm": 1.7040306329727173, + "learning_rate": 2.7601585728444007e-06, + "loss": 0.9659, + "step": 581 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 1.6346691846847534, + "learning_rate": 2.7552031714568884e-06, + "loss": 0.9305, + "step": 582 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 1.604477882385254, + "learning_rate": 2.750247770069376e-06, + "loss": 0.9696, + "step": 583 + }, + { + "epoch": 1.0409982174688057, + "grad_norm": 1.744341492652893, + "learning_rate": 2.7452923686818633e-06, + "loss": 0.9211, + "step": 584 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 1.7094818353652954, + "learning_rate": 2.740336967294351e-06, + "loss": 0.9603, + "step": 585 + }, + { + "epoch": 1.0445632798573976, + "grad_norm": 1.6708636283874512, + "learning_rate": 2.735381565906839e-06, + "loss": 0.9433, + "step": 586 + }, + { + "epoch": 1.0463458110516934, + "grad_norm": 1.702352523803711, + "learning_rate": 2.7304261645193263e-06, + "loss": 0.9388, + "step": 587 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 1.6145100593566895, + "learning_rate": 2.725470763131814e-06, + "loss": 0.9369, + "step": 588 + }, + { + "epoch": 1.049910873440285, + "grad_norm": 1.6174883842468262, + "learning_rate": 2.7205153617443016e-06, + "loss": 0.8993, + "step": 589 + }, + { + "epoch": 1.0516934046345812, + "grad_norm": 1.6352730989456177, + "learning_rate": 2.715559960356789e-06, + "loss": 0.9432, + "step": 590 + }, + { + "epoch": 1.0516934046345812, + "eval_loss": 1.0225163698196411, + "eval_runtime": 26.0491, + "eval_samples_per_second": 38.389, + "eval_steps_per_second": 2.419, + "step": 590 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 1.5941722393035889, + "learning_rate": 2.710604558969277e-06, + "loss": 0.9628, + "step": 591 + }, + { + "epoch": 1.0552584670231728, + "grad_norm": 1.645479440689087, + "learning_rate": 2.7056491575817646e-06, + "loss": 0.9459, + "step": 592 + }, + { + "epoch": 1.0570409982174689, + "grad_norm": 1.7374376058578491, + "learning_rate": 2.700693756194252e-06, + "loss": 0.9432, + "step": 593 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 1.7686152458190918, + "learning_rate": 2.6957383548067395e-06, + "loss": 0.9301, + "step": 594 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 1.615399718284607, + "learning_rate": 2.690782953419227e-06, + "loss": 0.9238, + "step": 595 + }, + { + "epoch": 1.0623885918003566, + "grad_norm": 1.615858554840088, + "learning_rate": 2.6858275520317144e-06, + "loss": 0.904, + "step": 596 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 1.6182299852371216, + "learning_rate": 2.6808721506442025e-06, + "loss": 0.8771, + "step": 597 + }, + { + "epoch": 1.0659536541889483, + "grad_norm": 1.731257438659668, + "learning_rate": 2.67591674925669e-06, + "loss": 0.9317, + "step": 598 + }, + { + "epoch": 1.067736185383244, + "grad_norm": 1.6173752546310425, + "learning_rate": 2.6709613478691778e-06, + "loss": 0.9584, + "step": 599 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 1.714568853378296, + "learning_rate": 2.666005946481665e-06, + "loss": 0.9318, + "step": 600 + }, + { + "epoch": 1.0695187165775402, + "eval_loss": 1.0215412378311157, + "eval_runtime": 25.9358, + "eval_samples_per_second": 38.557, + "eval_steps_per_second": 2.429, + "step": 600 + }, + { + "epoch": 1.071301247771836, + "grad_norm": 1.6474616527557373, + "learning_rate": 2.6610505450941527e-06, + "loss": 0.9121, + "step": 601 + }, + { + "epoch": 1.0730837789661318, + "grad_norm": 1.569778323173523, + "learning_rate": 2.6560951437066408e-06, + "loss": 0.9517, + "step": 602 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 1.6712771654129028, + "learning_rate": 2.651139742319128e-06, + "loss": 0.9473, + "step": 603 + }, + { + "epoch": 1.0766488413547237, + "grad_norm": 1.657092571258545, + "learning_rate": 2.6461843409316157e-06, + "loss": 0.9295, + "step": 604 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 1.5856530666351318, + "learning_rate": 2.6412289395441033e-06, + "loss": 0.9323, + "step": 605 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 1.6494325399398804, + "learning_rate": 2.6362735381565906e-06, + "loss": 0.9229, + "step": 606 + }, + { + "epoch": 1.0819964349376114, + "grad_norm": 1.6499768495559692, + "learning_rate": 2.6313181367690786e-06, + "loss": 0.9304, + "step": 607 + }, + { + "epoch": 1.0837789661319073, + "grad_norm": 1.6964021921157837, + "learning_rate": 2.6263627353815663e-06, + "loss": 0.9338, + "step": 608 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 1.6379368305206299, + "learning_rate": 2.621407333994054e-06, + "loss": 0.9413, + "step": 609 + }, + { + "epoch": 1.0873440285204992, + "grad_norm": 1.690470576286316, + "learning_rate": 2.616451932606541e-06, + "loss": 0.9529, + "step": 610 + }, + { + "epoch": 1.0873440285204992, + "eval_loss": 1.0203608274459839, + "eval_runtime": 25.7541, + "eval_samples_per_second": 38.829, + "eval_steps_per_second": 2.446, + "step": 610 + }, + { + "epoch": 1.089126559714795, + "grad_norm": 1.669008731842041, + "learning_rate": 2.611496531219029e-06, + "loss": 0.915, + "step": 611 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.6885886192321777, + "learning_rate": 2.606541129831516e-06, + "loss": 0.9694, + "step": 612 + }, + { + "epoch": 1.0926916221033869, + "grad_norm": 1.6819870471954346, + "learning_rate": 2.601585728444004e-06, + "loss": 0.894, + "step": 613 + }, + { + "epoch": 1.0944741532976827, + "grad_norm": 1.686949372291565, + "learning_rate": 2.596630327056492e-06, + "loss": 0.946, + "step": 614 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 1.7434266805648804, + "learning_rate": 2.5916749256689795e-06, + "loss": 0.9535, + "step": 615 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 1.6421117782592773, + "learning_rate": 2.5867195242814668e-06, + "loss": 0.9247, + "step": 616 + }, + { + "epoch": 1.0998217468805704, + "grad_norm": 1.7280988693237305, + "learning_rate": 2.5817641228939544e-06, + "loss": 0.9098, + "step": 617 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 1.6625720262527466, + "learning_rate": 2.5768087215064425e-06, + "loss": 0.916, + "step": 618 + }, + { + "epoch": 1.1033868092691623, + "grad_norm": 1.6401927471160889, + "learning_rate": 2.57185332011893e-06, + "loss": 0.9642, + "step": 619 + }, + { + "epoch": 1.1051693404634582, + "grad_norm": 1.6819369792938232, + "learning_rate": 2.5668979187314174e-06, + "loss": 0.9489, + "step": 620 + }, + { + "epoch": 1.1051693404634582, + "eval_loss": 1.0213559865951538, + "eval_runtime": 25.855, + "eval_samples_per_second": 38.677, + "eval_steps_per_second": 2.437, + "step": 620 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 1.6465580463409424, + "learning_rate": 2.561942517343905e-06, + "loss": 0.936, + "step": 621 + }, + { + "epoch": 1.1087344028520498, + "grad_norm": 1.751806616783142, + "learning_rate": 2.5569871159563923e-06, + "loss": 0.9177, + "step": 622 + }, + { + "epoch": 1.1105169340463459, + "grad_norm": 1.6855638027191162, + "learning_rate": 2.5520317145688804e-06, + "loss": 0.973, + "step": 623 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 1.575648546218872, + "learning_rate": 2.547076313181368e-06, + "loss": 0.9475, + "step": 624 + }, + { + "epoch": 1.1140819964349375, + "grad_norm": 1.7097973823547363, + "learning_rate": 2.5421209117938557e-06, + "loss": 0.941, + "step": 625 + }, + { + "epoch": 1.1158645276292336, + "grad_norm": 1.6338120698928833, + "learning_rate": 2.537165510406343e-06, + "loss": 1.003, + "step": 626 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 1.6847730875015259, + "learning_rate": 2.5322101090188306e-06, + "loss": 0.9165, + "step": 627 + }, + { + "epoch": 1.1194295900178253, + "grad_norm": 1.5890871286392212, + "learning_rate": 2.5272547076313183e-06, + "loss": 0.9363, + "step": 628 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 1.5626202821731567, + "learning_rate": 2.522299306243806e-06, + "loss": 0.9601, + "step": 629 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 1.5947809219360352, + "learning_rate": 2.5173439048562936e-06, + "loss": 0.9332, + "step": 630 + }, + { + "epoch": 1.1229946524064172, + "eval_loss": 1.0196335315704346, + "eval_runtime": 25.97, + "eval_samples_per_second": 38.506, + "eval_steps_per_second": 2.426, + "step": 630 + }, + { + "epoch": 1.124777183600713, + "grad_norm": 1.5689764022827148, + "learning_rate": 2.5123885034687813e-06, + "loss": 0.9045, + "step": 631 + }, + { + "epoch": 1.1265597147950088, + "grad_norm": 1.680080771446228, + "learning_rate": 2.5074331020812685e-06, + "loss": 0.9581, + "step": 632 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 1.6683106422424316, + "learning_rate": 2.502477700693756e-06, + "loss": 0.9295, + "step": 633 + }, + { + "epoch": 1.1301247771836007, + "grad_norm": 1.6343003511428833, + "learning_rate": 2.497522299306244e-06, + "loss": 0.9312, + "step": 634 + }, + { + "epoch": 1.1319073083778965, + "grad_norm": 1.6656992435455322, + "learning_rate": 2.492566897918732e-06, + "loss": 0.9265, + "step": 635 + }, + { + "epoch": 1.1336898395721926, + "grad_norm": 1.7076340913772583, + "learning_rate": 2.487611496531219e-06, + "loss": 0.9205, + "step": 636 + }, + { + "epoch": 1.1354723707664884, + "grad_norm": 1.6243927478790283, + "learning_rate": 2.482656095143707e-06, + "loss": 0.9429, + "step": 637 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 1.6368482112884521, + "learning_rate": 2.4777006937561945e-06, + "loss": 0.9155, + "step": 638 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 1.5730035305023193, + "learning_rate": 2.472745292368682e-06, + "loss": 0.9415, + "step": 639 + }, + { + "epoch": 1.1408199643493762, + "grad_norm": 1.5617438554763794, + "learning_rate": 2.4677898909811694e-06, + "loss": 0.9295, + "step": 640 + }, + { + "epoch": 1.1408199643493762, + "eval_loss": 1.0185329914093018, + "eval_runtime": 25.9777, + "eval_samples_per_second": 38.495, + "eval_steps_per_second": 2.425, + "step": 640 + }, + { + "epoch": 1.142602495543672, + "grad_norm": 1.5681148767471313, + "learning_rate": 2.4628344895936575e-06, + "loss": 0.9041, + "step": 641 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 1.6187138557434082, + "learning_rate": 2.4578790882061447e-06, + "loss": 0.9238, + "step": 642 + }, + { + "epoch": 1.1461675579322639, + "grad_norm": 1.5519737005233765, + "learning_rate": 2.4529236868186328e-06, + "loss": 0.922, + "step": 643 + }, + { + "epoch": 1.1479500891265597, + "grad_norm": 1.7167925834655762, + "learning_rate": 2.44796828543112e-06, + "loss": 0.9255, + "step": 644 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 1.6633977890014648, + "learning_rate": 2.4430128840436077e-06, + "loss": 0.9498, + "step": 645 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 1.593684196472168, + "learning_rate": 2.4380574826560953e-06, + "loss": 0.903, + "step": 646 + }, + { + "epoch": 1.1532976827094474, + "grad_norm": 1.733375906944275, + "learning_rate": 2.433102081268583e-06, + "loss": 0.9358, + "step": 647 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 1.6569396257400513, + "learning_rate": 2.4281466798810702e-06, + "loss": 0.9104, + "step": 648 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 1.7154539823532104, + "learning_rate": 2.4231912784935583e-06, + "loss": 0.922, + "step": 649 + }, + { + "epoch": 1.1586452762923352, + "grad_norm": 1.6517035961151123, + "learning_rate": 2.4182358771060456e-06, + "loss": 0.969, + "step": 650 + }, + { + "epoch": 1.1586452762923352, + "eval_loss": 1.020390272140503, + "eval_runtime": 26.0598, + "eval_samples_per_second": 38.373, + "eval_steps_per_second": 2.418, + "step": 650 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 1.5234638452529907, + "learning_rate": 2.4132804757185337e-06, + "loss": 0.8913, + "step": 651 + }, + { + "epoch": 1.1622103386809268, + "grad_norm": 1.697643756866455, + "learning_rate": 2.408325074331021e-06, + "loss": 0.9231, + "step": 652 + }, + { + "epoch": 1.1639928698752229, + "grad_norm": 1.6353060007095337, + "learning_rate": 2.4033696729435086e-06, + "loss": 0.9438, + "step": 653 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 1.615633487701416, + "learning_rate": 2.3984142715559962e-06, + "loss": 0.9352, + "step": 654 + }, + { + "epoch": 1.1675579322638145, + "grad_norm": 1.6155204772949219, + "learning_rate": 2.393458870168484e-06, + "loss": 0.9843, + "step": 655 + }, + { + "epoch": 1.1693404634581106, + "grad_norm": 1.696890115737915, + "learning_rate": 2.3885034687809715e-06, + "loss": 0.888, + "step": 656 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 1.7331446409225464, + "learning_rate": 2.383548067393459e-06, + "loss": 0.9122, + "step": 657 + }, + { + "epoch": 1.1729055258467023, + "grad_norm": 1.7322742938995361, + "learning_rate": 2.3785926660059464e-06, + "loss": 0.9233, + "step": 658 + }, + { + "epoch": 1.1746880570409983, + "grad_norm": 1.7611942291259766, + "learning_rate": 2.3736372646184345e-06, + "loss": 0.9364, + "step": 659 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.6479074954986572, + "learning_rate": 2.3686818632309218e-06, + "loss": 0.9462, + "step": 660 + }, + { + "epoch": 1.1764705882352942, + "eval_loss": 1.0187642574310303, + "eval_runtime": 25.8143, + "eval_samples_per_second": 38.738, + "eval_steps_per_second": 2.441, + "step": 660 + }, + { + "epoch": 1.17825311942959, + "grad_norm": 1.7428786754608154, + "learning_rate": 2.3637264618434094e-06, + "loss": 0.9287, + "step": 661 + }, + { + "epoch": 1.1800356506238858, + "grad_norm": 1.680544376373291, + "learning_rate": 2.358771060455897e-06, + "loss": 0.916, + "step": 662 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 1.5923148393630981, + "learning_rate": 2.3538156590683847e-06, + "loss": 0.945, + "step": 663 + }, + { + "epoch": 1.1836007130124777, + "grad_norm": 1.616050124168396, + "learning_rate": 2.3488602576808724e-06, + "loss": 0.9443, + "step": 664 + }, + { + "epoch": 1.1853832442067735, + "grad_norm": 1.5984090566635132, + "learning_rate": 2.34390485629336e-06, + "loss": 0.9196, + "step": 665 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 1.611007571220398, + "learning_rate": 2.3389494549058473e-06, + "loss": 0.9511, + "step": 666 + }, + { + "epoch": 1.1889483065953654, + "grad_norm": 1.755518913269043, + "learning_rate": 2.3339940535183354e-06, + "loss": 0.9542, + "step": 667 + }, + { + "epoch": 1.1907308377896613, + "grad_norm": 1.6217259168624878, + "learning_rate": 2.3290386521308226e-06, + "loss": 0.9308, + "step": 668 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 1.6171029806137085, + "learning_rate": 2.3240832507433103e-06, + "loss": 0.9602, + "step": 669 + }, + { + "epoch": 1.1942959001782532, + "grad_norm": 1.581703543663025, + "learning_rate": 2.319127849355798e-06, + "loss": 0.9284, + "step": 670 + }, + { + "epoch": 1.1942959001782532, + "eval_loss": 1.0156581401824951, + "eval_runtime": 25.9436, + "eval_samples_per_second": 38.545, + "eval_steps_per_second": 2.428, + "step": 670 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 1.608201026916504, + "learning_rate": 2.3141724479682856e-06, + "loss": 0.9226, + "step": 671 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 1.6719226837158203, + "learning_rate": 2.3092170465807733e-06, + "loss": 0.9547, + "step": 672 + }, + { + "epoch": 1.1996434937611409, + "grad_norm": 1.570652723312378, + "learning_rate": 2.304261645193261e-06, + "loss": 0.9306, + "step": 673 + }, + { + "epoch": 1.2014260249554367, + "grad_norm": 1.6573023796081543, + "learning_rate": 2.2993062438057486e-06, + "loss": 0.9672, + "step": 674 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 1.7345811128616333, + "learning_rate": 2.2943508424182363e-06, + "loss": 0.9166, + "step": 675 + }, + { + "epoch": 1.2049910873440286, + "grad_norm": 1.6902786493301392, + "learning_rate": 2.2893954410307235e-06, + "loss": 0.9546, + "step": 676 + }, + { + "epoch": 1.2067736185383244, + "grad_norm": 1.7392154932022095, + "learning_rate": 2.284440039643211e-06, + "loss": 0.9053, + "step": 677 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 1.5849647521972656, + "learning_rate": 2.279484638255699e-06, + "loss": 0.9533, + "step": 678 + }, + { + "epoch": 1.2103386809269163, + "grad_norm": 1.638115406036377, + "learning_rate": 2.2745292368681865e-06, + "loss": 0.9489, + "step": 679 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 1.6124337911605835, + "learning_rate": 2.269573835480674e-06, + "loss": 0.9775, + "step": 680 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 1.0148694515228271, + "eval_runtime": 25.8646, + "eval_samples_per_second": 38.663, + "eval_steps_per_second": 2.436, + "step": 680 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 1.5980111360549927, + "learning_rate": 2.264618434093162e-06, + "loss": 0.9381, + "step": 681 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 1.6980215311050415, + "learning_rate": 2.2596630327056495e-06, + "loss": 0.9105, + "step": 682 + }, + { + "epoch": 1.2174688057040999, + "grad_norm": 1.595822811126709, + "learning_rate": 2.254707631318137e-06, + "loss": 0.9326, + "step": 683 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 1.6184011697769165, + "learning_rate": 2.2497522299306244e-06, + "loss": 0.9102, + "step": 684 + }, + { + "epoch": 1.2210338680926915, + "grad_norm": 1.6323550939559937, + "learning_rate": 2.244796828543112e-06, + "loss": 0.937, + "step": 685 + }, + { + "epoch": 1.2228163992869876, + "grad_norm": 1.7325725555419922, + "learning_rate": 2.2398414271555997e-06, + "loss": 0.914, + "step": 686 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 1.7137166261672974, + "learning_rate": 2.2348860257680874e-06, + "loss": 0.9083, + "step": 687 + }, + { + "epoch": 1.2263814616755793, + "grad_norm": 1.6467852592468262, + "learning_rate": 2.229930624380575e-06, + "loss": 0.9091, + "step": 688 + }, + { + "epoch": 1.228163992869875, + "grad_norm": 1.683382272720337, + "learning_rate": 2.2249752229930627e-06, + "loss": 0.9592, + "step": 689 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 1.6149805784225464, + "learning_rate": 2.2200198216055503e-06, + "loss": 0.9362, + "step": 690 + }, + { + "epoch": 1.2299465240641712, + "eval_loss": 1.0140960216522217, + "eval_runtime": 25.8555, + "eval_samples_per_second": 38.676, + "eval_steps_per_second": 2.437, + "step": 690 + }, + { + "epoch": 1.231729055258467, + "grad_norm": 1.8765431642532349, + "learning_rate": 2.215064420218038e-06, + "loss": 0.9462, + "step": 691 + }, + { + "epoch": 1.2335115864527628, + "grad_norm": 1.7470018863677979, + "learning_rate": 2.2101090188305257e-06, + "loss": 0.9459, + "step": 692 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.7016042470932007, + "learning_rate": 2.205153617443013e-06, + "loss": 0.9107, + "step": 693 + }, + { + "epoch": 1.2370766488413547, + "grad_norm": 1.6355701684951782, + "learning_rate": 2.2001982160555006e-06, + "loss": 0.9052, + "step": 694 + }, + { + "epoch": 1.2388591800356505, + "grad_norm": 1.7940343618392944, + "learning_rate": 2.1952428146679882e-06, + "loss": 0.9227, + "step": 695 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 1.992951512336731, + "learning_rate": 2.190287413280476e-06, + "loss": 0.9405, + "step": 696 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 1.83491849899292, + "learning_rate": 2.1853320118929636e-06, + "loss": 0.9517, + "step": 697 + }, + { + "epoch": 1.2442067736185383, + "grad_norm": 1.682455062866211, + "learning_rate": 2.1803766105054512e-06, + "loss": 0.903, + "step": 698 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 1.6958837509155273, + "learning_rate": 2.175421209117939e-06, + "loss": 0.9456, + "step": 699 + }, + { + "epoch": 1.2477718360071302, + "grad_norm": 1.7611721754074097, + "learning_rate": 2.1704658077304265e-06, + "loss": 0.9759, + "step": 700 + }, + { + "epoch": 1.2477718360071302, + "eval_loss": 1.0106265544891357, + "eval_runtime": 26.0725, + "eval_samples_per_second": 38.355, + "eval_steps_per_second": 2.416, + "step": 700 + }, + { + "epoch": 1.249554367201426, + "grad_norm": 1.5676270723342896, + "learning_rate": 2.1655104063429138e-06, + "loss": 0.9711, + "step": 701 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 1.7543623447418213, + "learning_rate": 2.160555004955402e-06, + "loss": 0.919, + "step": 702 + }, + { + "epoch": 1.2531194295900179, + "grad_norm": 1.6871825456619263, + "learning_rate": 2.155599603567889e-06, + "loss": 0.9047, + "step": 703 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 1.7749131917953491, + "learning_rate": 2.1506442021803768e-06, + "loss": 0.9395, + "step": 704 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 1.681713342666626, + "learning_rate": 2.1456888007928644e-06, + "loss": 0.9127, + "step": 705 + }, + { + "epoch": 1.2584670231729056, + "grad_norm": 1.762078881263733, + "learning_rate": 2.140733399405352e-06, + "loss": 0.9399, + "step": 706 + }, + { + "epoch": 1.2602495543672014, + "grad_norm": 1.7800824642181396, + "learning_rate": 2.1357779980178398e-06, + "loss": 0.93, + "step": 707 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 1.7915018796920776, + "learning_rate": 2.1308225966303274e-06, + "loss": 0.968, + "step": 708 + }, + { + "epoch": 1.263814616755793, + "grad_norm": 1.7534856796264648, + "learning_rate": 2.1258671952428147e-06, + "loss": 0.9223, + "step": 709 + }, + { + "epoch": 1.2655971479500892, + "grad_norm": 1.744184136390686, + "learning_rate": 2.1209117938553027e-06, + "loss": 0.9232, + "step": 710 + }, + { + "epoch": 1.2655971479500892, + "eval_loss": 1.0113749504089355, + "eval_runtime": 25.7648, + "eval_samples_per_second": 38.813, + "eval_steps_per_second": 2.445, + "step": 710 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 1.5799130201339722, + "learning_rate": 2.11595639246779e-06, + "loss": 0.9227, + "step": 711 + }, + { + "epoch": 1.2691622103386808, + "grad_norm": 1.7383415699005127, + "learning_rate": 2.1110009910802776e-06, + "loss": 0.9304, + "step": 712 + }, + { + "epoch": 1.2709447415329769, + "grad_norm": 1.7710599899291992, + "learning_rate": 2.1060455896927653e-06, + "loss": 0.9381, + "step": 713 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.5871422290802002, + "learning_rate": 2.101090188305253e-06, + "loss": 0.9352, + "step": 714 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 1.674737572669983, + "learning_rate": 2.0961347869177406e-06, + "loss": 0.9646, + "step": 715 + }, + { + "epoch": 1.2762923351158646, + "grad_norm": 1.628126859664917, + "learning_rate": 2.0911793855302283e-06, + "loss": 0.9404, + "step": 716 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 1.7360297441482544, + "learning_rate": 2.0862239841427155e-06, + "loss": 0.9211, + "step": 717 + }, + { + "epoch": 1.2798573975044563, + "grad_norm": 1.6000936031341553, + "learning_rate": 2.0812685827552036e-06, + "loss": 0.9307, + "step": 718 + }, + { + "epoch": 1.2816399286987523, + "grad_norm": 1.631537675857544, + "learning_rate": 2.076313181367691e-06, + "loss": 0.9143, + "step": 719 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 1.8007129430770874, + "learning_rate": 2.0713577799801785e-06, + "loss": 0.9377, + "step": 720 + }, + { + "epoch": 1.2834224598930482, + "eval_loss": 1.0131229162216187, + "eval_runtime": 25.8685, + "eval_samples_per_second": 38.657, + "eval_steps_per_second": 2.435, + "step": 720 + }, + { + "epoch": 1.285204991087344, + "grad_norm": 1.8879586458206177, + "learning_rate": 2.066402378592666e-06, + "loss": 0.9399, + "step": 721 + }, + { + "epoch": 1.28698752228164, + "grad_norm": 1.6320232152938843, + "learning_rate": 2.061446977205154e-06, + "loss": 0.9223, + "step": 722 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 1.6035704612731934, + "learning_rate": 2.0564915758176415e-06, + "loss": 0.9243, + "step": 723 + }, + { + "epoch": 1.2905525846702317, + "grad_norm": 1.6737124919891357, + "learning_rate": 2.051536174430129e-06, + "loss": 0.8922, + "step": 724 + }, + { + "epoch": 1.2923351158645278, + "grad_norm": 1.7025960683822632, + "learning_rate": 2.0465807730426164e-06, + "loss": 0.9059, + "step": 725 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.6543670892715454, + "learning_rate": 2.0416253716551045e-06, + "loss": 0.9196, + "step": 726 + }, + { + "epoch": 1.2959001782531194, + "grad_norm": 1.604996919631958, + "learning_rate": 2.0366699702675917e-06, + "loss": 0.91, + "step": 727 + }, + { + "epoch": 1.2976827094474153, + "grad_norm": 1.603367567062378, + "learning_rate": 2.0317145688800794e-06, + "loss": 0.9169, + "step": 728 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 1.6768299341201782, + "learning_rate": 2.026759167492567e-06, + "loss": 0.9746, + "step": 729 + }, + { + "epoch": 1.3012477718360071, + "grad_norm": 1.7131927013397217, + "learning_rate": 2.0218037661050547e-06, + "loss": 0.9352, + "step": 730 + }, + { + "epoch": 1.3012477718360071, + "eval_loss": 1.0110057592391968, + "eval_runtime": 25.8117, + "eval_samples_per_second": 38.742, + "eval_steps_per_second": 2.441, + "step": 730 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 1.7616232633590698, + "learning_rate": 2.0168483647175424e-06, + "loss": 0.9446, + "step": 731 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 1.6162147521972656, + "learning_rate": 2.01189296333003e-06, + "loss": 0.9045, + "step": 732 + }, + { + "epoch": 1.3065953654188949, + "grad_norm": 1.6433504819869995, + "learning_rate": 2.0069375619425173e-06, + "loss": 0.9129, + "step": 733 + }, + { + "epoch": 1.3083778966131907, + "grad_norm": 1.5753357410430908, + "learning_rate": 2.0019821605550054e-06, + "loss": 0.9225, + "step": 734 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 1.5571519136428833, + "learning_rate": 1.9970267591674926e-06, + "loss": 0.911, + "step": 735 + }, + { + "epoch": 1.3119429590017826, + "grad_norm": 1.5970759391784668, + "learning_rate": 1.9920713577799803e-06, + "loss": 0.9415, + "step": 736 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 1.6597561836242676, + "learning_rate": 1.987115956392468e-06, + "loss": 0.9174, + "step": 737 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 1.6476688385009766, + "learning_rate": 1.9821605550049556e-06, + "loss": 0.9412, + "step": 738 + }, + { + "epoch": 1.3172905525846703, + "grad_norm": 1.6584250926971436, + "learning_rate": 1.9772051536174432e-06, + "loss": 0.8864, + "step": 739 + }, + { + "epoch": 1.3190730837789661, + "grad_norm": 1.5933711528778076, + "learning_rate": 1.972249752229931e-06, + "loss": 0.9246, + "step": 740 + }, + { + "epoch": 1.3190730837789661, + "eval_loss": 1.0107917785644531, + "eval_runtime": 25.8955, + "eval_samples_per_second": 38.617, + "eval_steps_per_second": 2.433, + "step": 740 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 1.6883563995361328, + "learning_rate": 1.967294350842418e-06, + "loss": 0.936, + "step": 741 + }, + { + "epoch": 1.322638146167558, + "grad_norm": 1.5187584161758423, + "learning_rate": 1.9623389494549062e-06, + "loss": 0.878, + "step": 742 + }, + { + "epoch": 1.3244206773618539, + "grad_norm": 1.6593390703201294, + "learning_rate": 1.9573835480673935e-06, + "loss": 0.8939, + "step": 743 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 1.630261778831482, + "learning_rate": 1.952428146679881e-06, + "loss": 0.9111, + "step": 744 + }, + { + "epoch": 1.3279857397504458, + "grad_norm": 1.6974488496780396, + "learning_rate": 1.9474727452923688e-06, + "loss": 0.941, + "step": 745 + }, + { + "epoch": 1.3297682709447416, + "grad_norm": 1.6816980838775635, + "learning_rate": 1.9425173439048564e-06, + "loss": 0.934, + "step": 746 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 1.6839442253112793, + "learning_rate": 1.937561942517344e-06, + "loss": 0.9156, + "step": 747 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.6905910968780518, + "learning_rate": 1.9326065411298318e-06, + "loss": 0.8929, + "step": 748 + }, + { + "epoch": 1.3351158645276293, + "grad_norm": 1.6082037687301636, + "learning_rate": 1.927651139742319e-06, + "loss": 0.8903, + "step": 749 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 1.6515074968338013, + "learning_rate": 1.922695738354807e-06, + "loss": 0.9105, + "step": 750 + }, + { + "epoch": 1.3368983957219251, + "eval_loss": 1.0095351934432983, + "eval_runtime": 25.9946, + "eval_samples_per_second": 38.47, + "eval_steps_per_second": 2.424, + "step": 750 + }, + { + "epoch": 1.338680926916221, + "grad_norm": 1.6954472064971924, + "learning_rate": 1.9177403369672943e-06, + "loss": 0.9011, + "step": 751 + }, + { + "epoch": 1.3404634581105168, + "grad_norm": 1.7742551565170288, + "learning_rate": 1.912784935579782e-06, + "loss": 0.931, + "step": 752 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 1.6957601308822632, + "learning_rate": 1.9078295341922697e-06, + "loss": 0.8868, + "step": 753 + }, + { + "epoch": 1.3440285204991087, + "grad_norm": 1.6613423824310303, + "learning_rate": 1.9028741328047573e-06, + "loss": 0.9428, + "step": 754 + }, + { + "epoch": 1.3458110516934045, + "grad_norm": 1.7479135990142822, + "learning_rate": 1.897918731417245e-06, + "loss": 0.9423, + "step": 755 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 1.7856110334396362, + "learning_rate": 1.8929633300297326e-06, + "loss": 0.9013, + "step": 756 + }, + { + "epoch": 1.3493761140819964, + "grad_norm": 1.7142354249954224, + "learning_rate": 1.88800792864222e-06, + "loss": 0.9013, + "step": 757 + }, + { + "epoch": 1.3511586452762923, + "grad_norm": 1.671119213104248, + "learning_rate": 1.883052527254708e-06, + "loss": 0.9154, + "step": 758 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.576013207435608, + "learning_rate": 1.8780971258671954e-06, + "loss": 0.9175, + "step": 759 + }, + { + "epoch": 1.3547237076648841, + "grad_norm": 1.6630672216415405, + "learning_rate": 1.8731417244796829e-06, + "loss": 0.9157, + "step": 760 + }, + { + "epoch": 1.3547237076648841, + "eval_loss": 1.009709358215332, + "eval_runtime": 25.8704, + "eval_samples_per_second": 38.654, + "eval_steps_per_second": 2.435, + "step": 760 + }, + { + "epoch": 1.35650623885918, + "grad_norm": 1.6932988166809082, + "learning_rate": 1.8681863230921707e-06, + "loss": 0.914, + "step": 761 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 1.6010832786560059, + "learning_rate": 1.8632309217046582e-06, + "loss": 0.9275, + "step": 762 + }, + { + "epoch": 1.3600713012477719, + "grad_norm": 1.722975730895996, + "learning_rate": 1.858275520317146e-06, + "loss": 0.9304, + "step": 763 + }, + { + "epoch": 1.3618538324420677, + "grad_norm": 1.6667062044143677, + "learning_rate": 1.8533201189296335e-06, + "loss": 0.9618, + "step": 764 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.6722793579101562, + "learning_rate": 1.848364717542121e-06, + "loss": 0.9219, + "step": 765 + }, + { + "epoch": 1.3654188948306596, + "grad_norm": 1.6280299425125122, + "learning_rate": 1.8434093161546088e-06, + "loss": 0.8931, + "step": 766 + }, + { + "epoch": 1.3672014260249554, + "grad_norm": 1.6584783792495728, + "learning_rate": 1.8384539147670963e-06, + "loss": 0.9327, + "step": 767 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 1.8174241781234741, + "learning_rate": 1.8334985133795837e-06, + "loss": 0.9375, + "step": 768 + }, + { + "epoch": 1.3707664884135473, + "grad_norm": 1.8208683729171753, + "learning_rate": 1.8285431119920716e-06, + "loss": 0.9005, + "step": 769 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 1.6982202529907227, + "learning_rate": 1.823587710604559e-06, + "loss": 0.9188, + "step": 770 + }, + { + "epoch": 1.3725490196078431, + "eval_loss": 1.0106472969055176, + "eval_runtime": 25.86, + "eval_samples_per_second": 38.67, + "eval_steps_per_second": 2.436, + "step": 770 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 1.5650068521499634, + "learning_rate": 1.818632309217047e-06, + "loss": 0.9318, + "step": 771 + }, + { + "epoch": 1.3761140819964348, + "grad_norm": 1.6363683938980103, + "learning_rate": 1.8136769078295344e-06, + "loss": 0.9317, + "step": 772 + }, + { + "epoch": 1.3778966131907309, + "grad_norm": 1.6135826110839844, + "learning_rate": 1.8087215064420218e-06, + "loss": 0.896, + "step": 773 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 1.6176116466522217, + "learning_rate": 1.8037661050545097e-06, + "loss": 0.8821, + "step": 774 + }, + { + "epoch": 1.3814616755793225, + "grad_norm": 1.7548078298568726, + "learning_rate": 1.7988107036669972e-06, + "loss": 0.9173, + "step": 775 + }, + { + "epoch": 1.3832442067736186, + "grad_norm": 1.79991614818573, + "learning_rate": 1.7938553022794846e-06, + "loss": 0.9288, + "step": 776 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 1.747171401977539, + "learning_rate": 1.7888999008919725e-06, + "loss": 0.9438, + "step": 777 + }, + { + "epoch": 1.3868092691622103, + "grad_norm": 1.6585170030593872, + "learning_rate": 1.78394449950446e-06, + "loss": 0.9284, + "step": 778 + }, + { + "epoch": 1.3885918003565063, + "grad_norm": 1.8032984733581543, + "learning_rate": 1.7789890981169478e-06, + "loss": 0.9052, + "step": 779 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 1.7318456172943115, + "learning_rate": 1.7740336967294353e-06, + "loss": 0.9191, + "step": 780 + }, + { + "epoch": 1.3903743315508021, + "eval_loss": 1.0097719430923462, + "eval_runtime": 25.8995, + "eval_samples_per_second": 38.611, + "eval_steps_per_second": 2.432, + "step": 780 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 1.6782145500183105, + "learning_rate": 1.7690782953419227e-06, + "loss": 0.9304, + "step": 781 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 1.6794602870941162, + "learning_rate": 1.7641228939544106e-06, + "loss": 0.8886, + "step": 782 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 1.6758161783218384, + "learning_rate": 1.759167492566898e-06, + "loss": 0.937, + "step": 783 + }, + { + "epoch": 1.3975044563279857, + "grad_norm": 1.7178926467895508, + "learning_rate": 1.7542120911793855e-06, + "loss": 0.9135, + "step": 784 + }, + { + "epoch": 1.3992869875222818, + "grad_norm": 1.7379509210586548, + "learning_rate": 1.7492566897918734e-06, + "loss": 0.9108, + "step": 785 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 1.9025062322616577, + "learning_rate": 1.7443012884043608e-06, + "loss": 0.9718, + "step": 786 + }, + { + "epoch": 1.4028520499108734, + "grad_norm": 1.7181079387664795, + "learning_rate": 1.7393458870168487e-06, + "loss": 0.9131, + "step": 787 + }, + { + "epoch": 1.4046345811051695, + "grad_norm": 1.8086258172988892, + "learning_rate": 1.7343904856293361e-06, + "loss": 0.9251, + "step": 788 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 1.721421718597412, + "learning_rate": 1.7294350842418236e-06, + "loss": 0.9466, + "step": 789 + }, + { + "epoch": 1.4081996434937611, + "grad_norm": 1.7858272790908813, + "learning_rate": 1.7244796828543115e-06, + "loss": 0.9413, + "step": 790 + }, + { + "epoch": 1.4081996434937611, + "eval_loss": 1.0092817544937134, + "eval_runtime": 26.014, + "eval_samples_per_second": 38.441, + "eval_steps_per_second": 2.422, + "step": 790 + }, + { + "epoch": 1.409982174688057, + "grad_norm": 1.6175751686096191, + "learning_rate": 1.719524281466799e-06, + "loss": 0.9282, + "step": 791 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 1.6843942403793335, + "learning_rate": 1.7145688800792864e-06, + "loss": 0.8947, + "step": 792 + }, + { + "epoch": 1.4135472370766489, + "grad_norm": 1.639022946357727, + "learning_rate": 1.7096134786917742e-06, + "loss": 0.9174, + "step": 793 + }, + { + "epoch": 1.4153297682709447, + "grad_norm": 1.6527246236801147, + "learning_rate": 1.7046580773042617e-06, + "loss": 0.9325, + "step": 794 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 1.6931911706924438, + "learning_rate": 1.6997026759167496e-06, + "loss": 0.9105, + "step": 795 + }, + { + "epoch": 1.4188948306595366, + "grad_norm": 1.626141905784607, + "learning_rate": 1.694747274529237e-06, + "loss": 0.8941, + "step": 796 + }, + { + "epoch": 1.4206773618538324, + "grad_norm": 1.7113869190216064, + "learning_rate": 1.6897918731417245e-06, + "loss": 0.9012, + "step": 797 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 1.6680278778076172, + "learning_rate": 1.6848364717542123e-06, + "loss": 0.9116, + "step": 798 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 1.7071088552474976, + "learning_rate": 1.6798810703666998e-06, + "loss": 0.9217, + "step": 799 + }, + { + "epoch": 1.4260249554367201, + "grad_norm": 1.6858220100402832, + "learning_rate": 1.6749256689791874e-06, + "loss": 0.9277, + "step": 800 + }, + { + "epoch": 1.4260249554367201, + "eval_loss": 1.0078998804092407, + "eval_runtime": 25.8631, + "eval_samples_per_second": 38.665, + "eval_steps_per_second": 2.436, + "step": 800 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 1.6459059715270996, + "learning_rate": 1.669970267591675e-06, + "loss": 0.951, + "step": 801 + }, + { + "epoch": 1.429590017825312, + "grad_norm": 1.6655635833740234, + "learning_rate": 1.6650148662041625e-06, + "loss": 0.921, + "step": 802 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 1.7144039869308472, + "learning_rate": 1.6600594648166502e-06, + "loss": 0.9186, + "step": 803 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 1.6617674827575684, + "learning_rate": 1.6551040634291379e-06, + "loss": 0.9002, + "step": 804 + }, + { + "epoch": 1.4349376114081998, + "grad_norm": 1.7272840738296509, + "learning_rate": 1.6501486620416255e-06, + "loss": 0.9253, + "step": 805 + }, + { + "epoch": 1.4367201426024956, + "grad_norm": 1.717220664024353, + "learning_rate": 1.6451932606541132e-06, + "loss": 0.9674, + "step": 806 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 1.6640911102294922, + "learning_rate": 1.6402378592666006e-06, + "loss": 0.9324, + "step": 807 + }, + { + "epoch": 1.4402852049910875, + "grad_norm": 1.6072384119033813, + "learning_rate": 1.6352824578790883e-06, + "loss": 0.9289, + "step": 808 + }, + { + "epoch": 1.4420677361853833, + "grad_norm": 1.7120065689086914, + "learning_rate": 1.630327056491576e-06, + "loss": 0.8992, + "step": 809 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 1.6117724180221558, + "learning_rate": 1.6253716551040636e-06, + "loss": 0.8798, + "step": 810 + }, + { + "epoch": 1.4438502673796791, + "eval_loss": 1.0046720504760742, + "eval_runtime": 26.0154, + "eval_samples_per_second": 38.439, + "eval_steps_per_second": 2.422, + "step": 810 + }, + { + "epoch": 1.445632798573975, + "grad_norm": 1.689333200454712, + "learning_rate": 1.620416253716551e-06, + "loss": 0.9071, + "step": 811 + }, + { + "epoch": 1.4474153297682708, + "grad_norm": 1.5870457887649536, + "learning_rate": 1.6154608523290387e-06, + "loss": 0.9308, + "step": 812 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 1.7409943342208862, + "learning_rate": 1.6105054509415264e-06, + "loss": 0.9348, + "step": 813 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 1.6735934019088745, + "learning_rate": 1.605550049554014e-06, + "loss": 0.9443, + "step": 814 + }, + { + "epoch": 1.4527629233511585, + "grad_norm": 1.6621036529541016, + "learning_rate": 1.6005946481665015e-06, + "loss": 0.9159, + "step": 815 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.613136887550354, + "learning_rate": 1.5956392467789892e-06, + "loss": 0.9043, + "step": 816 + }, + { + "epoch": 1.4563279857397504, + "grad_norm": 1.560819149017334, + "learning_rate": 1.5906838453914768e-06, + "loss": 0.9034, + "step": 817 + }, + { + "epoch": 1.4581105169340463, + "grad_norm": 1.5957945585250854, + "learning_rate": 1.5857284440039645e-06, + "loss": 0.8977, + "step": 818 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 1.608081340789795, + "learning_rate": 1.580773042616452e-06, + "loss": 0.888, + "step": 819 + }, + { + "epoch": 1.4616755793226381, + "grad_norm": 1.6894760131835938, + "learning_rate": 1.5758176412289396e-06, + "loss": 0.9139, + "step": 820 + }, + { + "epoch": 1.4616755793226381, + "eval_loss": 1.0035488605499268, + "eval_runtime": 25.9248, + "eval_samples_per_second": 38.573, + "eval_steps_per_second": 2.43, + "step": 820 + }, + { + "epoch": 1.463458110516934, + "grad_norm": 1.691773533821106, + "learning_rate": 1.5708622398414273e-06, + "loss": 0.9202, + "step": 821 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 1.7405924797058105, + "learning_rate": 1.565906838453915e-06, + "loss": 0.9049, + "step": 822 + }, + { + "epoch": 1.4670231729055259, + "grad_norm": 1.6937116384506226, + "learning_rate": 1.5609514370664026e-06, + "loss": 0.9305, + "step": 823 + }, + { + "epoch": 1.4688057040998217, + "grad_norm": 1.7311570644378662, + "learning_rate": 1.55599603567889e-06, + "loss": 0.9446, + "step": 824 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.6122374534606934, + "learning_rate": 1.5510406342913777e-06, + "loss": 0.9097, + "step": 825 + }, + { + "epoch": 1.4723707664884136, + "grad_norm": 1.6009140014648438, + "learning_rate": 1.5460852329038654e-06, + "loss": 0.9213, + "step": 826 + }, + { + "epoch": 1.4741532976827094, + "grad_norm": 1.6482139825820923, + "learning_rate": 1.5411298315163528e-06, + "loss": 0.9446, + "step": 827 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 1.6383440494537354, + "learning_rate": 1.5361744301288407e-06, + "loss": 0.9326, + "step": 828 + }, + { + "epoch": 1.4777183600713013, + "grad_norm": 1.675575852394104, + "learning_rate": 1.5312190287413281e-06, + "loss": 0.9304, + "step": 829 + }, + { + "epoch": 1.4795008912655971, + "grad_norm": 1.6283584833145142, + "learning_rate": 1.5262636273538158e-06, + "loss": 0.8586, + "step": 830 + }, + { + "epoch": 1.4795008912655971, + "eval_loss": 1.002890944480896, + "eval_runtime": 25.9393, + "eval_samples_per_second": 38.552, + "eval_steps_per_second": 2.429, + "step": 830 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 1.7265905141830444, + "learning_rate": 1.5213082259663035e-06, + "loss": 0.9409, + "step": 831 + }, + { + "epoch": 1.483065953654189, + "grad_norm": 1.659981369972229, + "learning_rate": 1.516352824578791e-06, + "loss": 0.9376, + "step": 832 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 1.710387945175171, + "learning_rate": 1.5113974231912786e-06, + "loss": 0.9041, + "step": 833 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 1.6636488437652588, + "learning_rate": 1.5064420218037662e-06, + "loss": 0.9171, + "step": 834 + }, + { + "epoch": 1.4884135472370765, + "grad_norm": 1.7282999753952026, + "learning_rate": 1.5014866204162537e-06, + "loss": 0.9034, + "step": 835 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 1.7450913190841675, + "learning_rate": 1.4965312190287416e-06, + "loss": 0.9341, + "step": 836 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 1.667269229888916, + "learning_rate": 1.491575817641229e-06, + "loss": 0.9033, + "step": 837 + }, + { + "epoch": 1.4937611408199643, + "grad_norm": 1.6336359977722168, + "learning_rate": 1.4866204162537167e-06, + "loss": 0.916, + "step": 838 + }, + { + "epoch": 1.4955436720142603, + "grad_norm": 1.5937727689743042, + "learning_rate": 1.4816650148662043e-06, + "loss": 0.9012, + "step": 839 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 1.717529535293579, + "learning_rate": 1.4767096134786918e-06, + "loss": 0.9337, + "step": 840 + }, + { + "epoch": 1.4973262032085561, + "eval_loss": 1.0023201704025269, + "eval_runtime": 25.9528, + "eval_samples_per_second": 38.532, + "eval_steps_per_second": 2.427, + "step": 840 + }, + { + "epoch": 1.499108734402852, + "grad_norm": 1.6657592058181763, + "learning_rate": 1.4717542120911797e-06, + "loss": 0.939, + "step": 841 + }, + { + "epoch": 1.500891265597148, + "grad_norm": 1.7361196279525757, + "learning_rate": 1.4667988107036671e-06, + "loss": 0.9348, + "step": 842 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 1.6162766218185425, + "learning_rate": 1.4618434093161546e-06, + "loss": 0.9382, + "step": 843 + }, + { + "epoch": 1.5044563279857397, + "grad_norm": 1.639491319656372, + "learning_rate": 1.4568880079286424e-06, + "loss": 0.9133, + "step": 844 + }, + { + "epoch": 1.5062388591800357, + "grad_norm": 1.5712809562683105, + "learning_rate": 1.4519326065411299e-06, + "loss": 0.9152, + "step": 845 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 1.639657974243164, + "learning_rate": 1.4469772051536178e-06, + "loss": 0.9398, + "step": 846 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 1.664557695388794, + "learning_rate": 1.4420218037661052e-06, + "loss": 0.8776, + "step": 847 + }, + { + "epoch": 1.5115864527629235, + "grad_norm": 1.6303818225860596, + "learning_rate": 1.4370664023785927e-06, + "loss": 0.9392, + "step": 848 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 1.6585919857025146, + "learning_rate": 1.4321110009910805e-06, + "loss": 0.9181, + "step": 849 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.7727773189544678, + "learning_rate": 1.427155599603568e-06, + "loss": 0.9333, + "step": 850 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 1.0036695003509521, + "eval_runtime": 25.8611, + "eval_samples_per_second": 38.668, + "eval_steps_per_second": 2.436, + "step": 850 + }, + { + "epoch": 1.5169340463458112, + "grad_norm": 1.6858317852020264, + "learning_rate": 1.4222001982160554e-06, + "loss": 0.9017, + "step": 851 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 1.6429566144943237, + "learning_rate": 1.4172447968285433e-06, + "loss": 0.9047, + "step": 852 + }, + { + "epoch": 1.5204991087344029, + "grad_norm": 1.877684235572815, + "learning_rate": 1.4122893954410308e-06, + "loss": 0.9314, + "step": 853 + }, + { + "epoch": 1.522281639928699, + "grad_norm": 1.6848722696304321, + "learning_rate": 1.4073339940535186e-06, + "loss": 0.914, + "step": 854 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 1.7669544219970703, + "learning_rate": 1.402378592666006e-06, + "loss": 0.9158, + "step": 855 + }, + { + "epoch": 1.5258467023172906, + "grad_norm": 1.6447798013687134, + "learning_rate": 1.3974231912784935e-06, + "loss": 0.9603, + "step": 856 + }, + { + "epoch": 1.5276292335115864, + "grad_norm": 1.6374293565750122, + "learning_rate": 1.3924677898909814e-06, + "loss": 0.9168, + "step": 857 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 1.614821434020996, + "learning_rate": 1.3875123885034689e-06, + "loss": 0.9262, + "step": 858 + }, + { + "epoch": 1.5311942959001783, + "grad_norm": 1.6780483722686768, + "learning_rate": 1.3825569871159563e-06, + "loss": 0.9218, + "step": 859 + }, + { + "epoch": 1.5329768270944741, + "grad_norm": 1.727554202079773, + "learning_rate": 1.3776015857284442e-06, + "loss": 0.9085, + "step": 860 + }, + { + "epoch": 1.5329768270944741, + "eval_loss": 1.0021950006484985, + "eval_runtime": 25.7933, + "eval_samples_per_second": 38.77, + "eval_steps_per_second": 2.442, + "step": 860 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 1.6244946718215942, + "learning_rate": 1.3726461843409316e-06, + "loss": 0.9083, + "step": 861 + }, + { + "epoch": 1.536541889483066, + "grad_norm": 1.7828315496444702, + "learning_rate": 1.3676907829534195e-06, + "loss": 0.9258, + "step": 862 + }, + { + "epoch": 1.5383244206773619, + "grad_norm": 1.6405810117721558, + "learning_rate": 1.362735381565907e-06, + "loss": 0.9407, + "step": 863 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 1.7271147966384888, + "learning_rate": 1.3577799801783944e-06, + "loss": 0.9375, + "step": 864 + }, + { + "epoch": 1.5418894830659537, + "grad_norm": 1.789184808731079, + "learning_rate": 1.3528245787908823e-06, + "loss": 0.9062, + "step": 865 + }, + { + "epoch": 1.5436720142602496, + "grad_norm": 1.7284061908721924, + "learning_rate": 1.3478691774033697e-06, + "loss": 0.901, + "step": 866 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.7288529872894287, + "learning_rate": 1.3429137760158572e-06, + "loss": 0.8885, + "step": 867 + }, + { + "epoch": 1.5472370766488415, + "grad_norm": 1.71056067943573, + "learning_rate": 1.337958374628345e-06, + "loss": 0.8895, + "step": 868 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 1.8451443910598755, + "learning_rate": 1.3330029732408325e-06, + "loss": 0.9351, + "step": 869 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 1.7918177843093872, + "learning_rate": 1.3280475718533204e-06, + "loss": 0.9022, + "step": 870 + }, + { + "epoch": 1.5508021390374331, + "eval_loss": 1.0015515089035034, + "eval_runtime": 25.6272, + "eval_samples_per_second": 39.021, + "eval_steps_per_second": 2.458, + "step": 870 + }, + { + "epoch": 1.5525846702317292, + "grad_norm": 1.7702810764312744, + "learning_rate": 1.3230921704658078e-06, + "loss": 0.9402, + "step": 871 + }, + { + "epoch": 1.5543672014260248, + "grad_norm": 1.6390447616577148, + "learning_rate": 1.3181367690782953e-06, + "loss": 0.8972, + "step": 872 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 1.6672519445419312, + "learning_rate": 1.3131813676907832e-06, + "loss": 0.9124, + "step": 873 + }, + { + "epoch": 1.557932263814617, + "grad_norm": 1.7509058713912964, + "learning_rate": 1.3082259663032706e-06, + "loss": 0.8998, + "step": 874 + }, + { + "epoch": 1.5597147950089125, + "grad_norm": 1.7143068313598633, + "learning_rate": 1.303270564915758e-06, + "loss": 0.9138, + "step": 875 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 1.778315544128418, + "learning_rate": 1.298315163528246e-06, + "loss": 0.9245, + "step": 876 + }, + { + "epoch": 1.5632798573975044, + "grad_norm": 1.7348226308822632, + "learning_rate": 1.2933597621407334e-06, + "loss": 0.9192, + "step": 877 + }, + { + "epoch": 1.5650623885918002, + "grad_norm": 1.6273071765899658, + "learning_rate": 1.2884043607532213e-06, + "loss": 0.9332, + "step": 878 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 1.9141535758972168, + "learning_rate": 1.2834489593657087e-06, + "loss": 0.932, + "step": 879 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 1.7379990816116333, + "learning_rate": 1.2784935579781962e-06, + "loss": 0.9357, + "step": 880 + }, + { + "epoch": 1.5686274509803921, + "eval_loss": 1.0007495880126953, + "eval_runtime": 25.606, + "eval_samples_per_second": 39.053, + "eval_steps_per_second": 2.46, + "step": 880 + }, + { + "epoch": 1.570409982174688, + "grad_norm": 1.6585184335708618, + "learning_rate": 1.273538156590684e-06, + "loss": 0.8919, + "step": 881 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 1.6785247325897217, + "learning_rate": 1.2685827552031715e-06, + "loss": 0.914, + "step": 882 + }, + { + "epoch": 1.5739750445632799, + "grad_norm": 1.7640776634216309, + "learning_rate": 1.2636273538156591e-06, + "loss": 0.9166, + "step": 883 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 1.8093239068984985, + "learning_rate": 1.2586719524281468e-06, + "loss": 0.9098, + "step": 884 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 1.623347520828247, + "learning_rate": 1.2537165510406342e-06, + "loss": 0.8963, + "step": 885 + }, + { + "epoch": 1.5793226381461676, + "grad_norm": 1.7071688175201416, + "learning_rate": 1.248761149653122e-06, + "loss": 0.9077, + "step": 886 + }, + { + "epoch": 1.5811051693404634, + "grad_norm": 1.6982461214065552, + "learning_rate": 1.2438057482656096e-06, + "loss": 0.897, + "step": 887 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 1.7919094562530518, + "learning_rate": 1.2388503468780972e-06, + "loss": 0.904, + "step": 888 + }, + { + "epoch": 1.5846702317290553, + "grad_norm": 1.7249748706817627, + "learning_rate": 1.2338949454905847e-06, + "loss": 0.9144, + "step": 889 + }, + { + "epoch": 1.5864527629233511, + "grad_norm": 1.6791118383407593, + "learning_rate": 1.2289395441030723e-06, + "loss": 0.9086, + "step": 890 + }, + { + "epoch": 1.5864527629233511, + "eval_loss": 0.9997517466545105, + "eval_runtime": 25.5898, + "eval_samples_per_second": 39.078, + "eval_steps_per_second": 2.462, + "step": 890 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 1.78848135471344, + "learning_rate": 1.22398414271556e-06, + "loss": 0.9264, + "step": 891 + }, + { + "epoch": 1.5900178253119428, + "grad_norm": 1.6467989683151245, + "learning_rate": 1.2190287413280477e-06, + "loss": 0.9457, + "step": 892 + }, + { + "epoch": 1.5918003565062389, + "grad_norm": 1.7058672904968262, + "learning_rate": 1.2140733399405351e-06, + "loss": 0.8898, + "step": 893 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 1.7215371131896973, + "learning_rate": 1.2091179385530228e-06, + "loss": 0.954, + "step": 894 + }, + { + "epoch": 1.5953654188948305, + "grad_norm": 1.7038347721099854, + "learning_rate": 1.2041625371655104e-06, + "loss": 0.9021, + "step": 895 + }, + { + "epoch": 1.5971479500891266, + "grad_norm": 1.6531912088394165, + "learning_rate": 1.1992071357779981e-06, + "loss": 0.8992, + "step": 896 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 1.6344718933105469, + "learning_rate": 1.1942517343904858e-06, + "loss": 0.8755, + "step": 897 + }, + { + "epoch": 1.6007130124777182, + "grad_norm": 1.667981743812561, + "learning_rate": 1.1892963330029732e-06, + "loss": 0.9112, + "step": 898 + }, + { + "epoch": 1.6024955436720143, + "grad_norm": 1.7452551126480103, + "learning_rate": 1.1843409316154609e-06, + "loss": 0.9181, + "step": 899 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 1.6953660249710083, + "learning_rate": 1.1793855302279485e-06, + "loss": 0.9109, + "step": 900 + }, + { + "epoch": 1.6042780748663101, + "eval_loss": 1.0002549886703491, + "eval_runtime": 25.6596, + "eval_samples_per_second": 38.972, + "eval_steps_per_second": 2.455, + "step": 900 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 1.733874797821045, + "learning_rate": 1.1744301288404362e-06, + "loss": 0.9067, + "step": 901 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 1.6959171295166016, + "learning_rate": 1.1694747274529237e-06, + "loss": 0.8932, + "step": 902 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 1.7267096042633057, + "learning_rate": 1.1645193260654113e-06, + "loss": 0.9315, + "step": 903 + }, + { + "epoch": 1.6114081996434937, + "grad_norm": 1.755772352218628, + "learning_rate": 1.159563924677899e-06, + "loss": 0.9187, + "step": 904 + }, + { + "epoch": 1.6131907308377897, + "grad_norm": 1.570746898651123, + "learning_rate": 1.1546085232903866e-06, + "loss": 0.8931, + "step": 905 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 1.7020586729049683, + "learning_rate": 1.1496531219028743e-06, + "loss": 0.9247, + "step": 906 + }, + { + "epoch": 1.6167557932263814, + "grad_norm": 1.6069488525390625, + "learning_rate": 1.1446977205153618e-06, + "loss": 0.8965, + "step": 907 + }, + { + "epoch": 1.6185383244206775, + "grad_norm": 1.6286472082138062, + "learning_rate": 1.1397423191278494e-06, + "loss": 0.9191, + "step": 908 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 1.5472110509872437, + "learning_rate": 1.134786917740337e-06, + "loss": 0.8944, + "step": 909 + }, + { + "epoch": 1.6221033868092691, + "grad_norm": 1.8870779275894165, + "learning_rate": 1.1298315163528247e-06, + "loss": 0.94, + "step": 910 + }, + { + "epoch": 1.6221033868092691, + "eval_loss": 0.9995793700218201, + "eval_runtime": 25.6625, + "eval_samples_per_second": 38.967, + "eval_steps_per_second": 2.455, + "step": 910 + }, + { + "epoch": 1.6238859180035652, + "grad_norm": 1.745804786682129, + "learning_rate": 1.1248761149653122e-06, + "loss": 0.9077, + "step": 911 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 1.6671708822250366, + "learning_rate": 1.1199207135777999e-06, + "loss": 0.909, + "step": 912 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 1.612488865852356, + "learning_rate": 1.1149653121902875e-06, + "loss": 0.9034, + "step": 913 + }, + { + "epoch": 1.629233511586453, + "grad_norm": 1.620811104774475, + "learning_rate": 1.1100099108027752e-06, + "loss": 0.9386, + "step": 914 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 1.6019529104232788, + "learning_rate": 1.1050545094152628e-06, + "loss": 0.8877, + "step": 915 + }, + { + "epoch": 1.6327985739750446, + "grad_norm": 1.613588809967041, + "learning_rate": 1.1000991080277503e-06, + "loss": 0.9113, + "step": 916 + }, + { + "epoch": 1.6345811051693404, + "grad_norm": 1.5803701877593994, + "learning_rate": 1.095143706640238e-06, + "loss": 0.896, + "step": 917 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.7000629901885986, + "learning_rate": 1.0901883052527256e-06, + "loss": 0.9184, + "step": 918 + }, + { + "epoch": 1.6381461675579323, + "grad_norm": 1.5939161777496338, + "learning_rate": 1.0852329038652133e-06, + "loss": 0.9119, + "step": 919 + }, + { + "epoch": 1.6399286987522281, + "grad_norm": 1.7102794647216797, + "learning_rate": 1.080277502477701e-06, + "loss": 0.9483, + "step": 920 + }, + { + "epoch": 1.6399286987522281, + "eval_loss": 0.9989098906517029, + "eval_runtime": 25.7806, + "eval_samples_per_second": 38.789, + "eval_steps_per_second": 2.444, + "step": 920 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 1.713068962097168, + "learning_rate": 1.0753221010901884e-06, + "loss": 0.9219, + "step": 921 + }, + { + "epoch": 1.64349376114082, + "grad_norm": 1.6815000772476196, + "learning_rate": 1.070366699702676e-06, + "loss": 0.9216, + "step": 922 + }, + { + "epoch": 1.6452762923351159, + "grad_norm": 1.6452995538711548, + "learning_rate": 1.0654112983151637e-06, + "loss": 0.8934, + "step": 923 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.6780235767364502, + "learning_rate": 1.0604558969276514e-06, + "loss": 0.9355, + "step": 924 + }, + { + "epoch": 1.6488413547237077, + "grad_norm": 1.781059980392456, + "learning_rate": 1.0555004955401388e-06, + "loss": 0.9272, + "step": 925 + }, + { + "epoch": 1.6506238859180036, + "grad_norm": 1.7290118932724, + "learning_rate": 1.0505450941526265e-06, + "loss": 0.89, + "step": 926 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 1.6904546022415161, + "learning_rate": 1.0455896927651141e-06, + "loss": 0.8923, + "step": 927 + }, + { + "epoch": 1.6541889483065955, + "grad_norm": 1.7392773628234863, + "learning_rate": 1.0406342913776018e-06, + "loss": 0.92, + "step": 928 + }, + { + "epoch": 1.6559714795008913, + "grad_norm": 1.676623821258545, + "learning_rate": 1.0356788899900893e-06, + "loss": 0.9204, + "step": 929 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 1.813863754272461, + "learning_rate": 1.030723488602577e-06, + "loss": 0.9442, + "step": 930 + }, + { + "epoch": 1.6577540106951871, + "eval_loss": 0.9988206624984741, + "eval_runtime": 25.7006, + "eval_samples_per_second": 38.91, + "eval_steps_per_second": 2.451, + "step": 930 + }, + { + "epoch": 1.6595365418894832, + "grad_norm": 1.7283135652542114, + "learning_rate": 1.0257680872150646e-06, + "loss": 0.9189, + "step": 931 + }, + { + "epoch": 1.661319073083779, + "grad_norm": 1.6723238229751587, + "learning_rate": 1.0208126858275522e-06, + "loss": 0.9173, + "step": 932 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 1.6681349277496338, + "learning_rate": 1.0158572844400397e-06, + "loss": 0.8979, + "step": 933 + }, + { + "epoch": 1.664884135472371, + "grad_norm": 1.6180405616760254, + "learning_rate": 1.0109018830525274e-06, + "loss": 0.9034, + "step": 934 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.7722523212432861, + "learning_rate": 1.005946481665015e-06, + "loss": 0.9223, + "step": 935 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 1.613845705986023, + "learning_rate": 1.0009910802775027e-06, + "loss": 0.9204, + "step": 936 + }, + { + "epoch": 1.6702317290552586, + "grad_norm": 1.751266360282898, + "learning_rate": 9.960356788899901e-07, + "loss": 0.9552, + "step": 937 + }, + { + "epoch": 1.6720142602495542, + "grad_norm": 1.6438246965408325, + "learning_rate": 9.910802775024778e-07, + "loss": 0.9023, + "step": 938 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 1.7301279306411743, + "learning_rate": 9.861248761149655e-07, + "loss": 0.9408, + "step": 939 + }, + { + "epoch": 1.6755793226381461, + "grad_norm": 1.5920662879943848, + "learning_rate": 9.811694747274531e-07, + "loss": 0.8898, + "step": 940 + }, + { + "epoch": 1.6755793226381461, + "eval_loss": 0.9970239400863647, + "eval_runtime": 25.6234, + "eval_samples_per_second": 39.027, + "eval_steps_per_second": 2.459, + "step": 940 + }, + { + "epoch": 1.677361853832442, + "grad_norm": 1.7048591375350952, + "learning_rate": 9.762140733399406e-07, + "loss": 0.9092, + "step": 941 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 1.7565436363220215, + "learning_rate": 9.712586719524282e-07, + "loss": 0.9229, + "step": 942 + }, + { + "epoch": 1.6809269162210339, + "grad_norm": 1.7713767290115356, + "learning_rate": 9.663032705649159e-07, + "loss": 0.9253, + "step": 943 + }, + { + "epoch": 1.6827094474153297, + "grad_norm": 1.864357352256775, + "learning_rate": 9.613478691774035e-07, + "loss": 0.9322, + "step": 944 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 1.6992820501327515, + "learning_rate": 9.56392467789891e-07, + "loss": 0.8836, + "step": 945 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 1.6879699230194092, + "learning_rate": 9.514370664023787e-07, + "loss": 0.9119, + "step": 946 + }, + { + "epoch": 1.6880570409982174, + "grad_norm": 1.6944903135299683, + "learning_rate": 9.464816650148663e-07, + "loss": 0.9365, + "step": 947 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 1.694254994392395, + "learning_rate": 9.41526263627354e-07, + "loss": 0.9524, + "step": 948 + }, + { + "epoch": 1.6916221033868093, + "grad_norm": 1.6590172052383423, + "learning_rate": 9.365708622398414e-07, + "loss": 0.9276, + "step": 949 + }, + { + "epoch": 1.6934046345811051, + "grad_norm": 1.7332650423049927, + "learning_rate": 9.316154608523291e-07, + "loss": 0.9267, + "step": 950 + }, + { + "epoch": 1.6934046345811051, + "eval_loss": 0.9965516328811646, + "eval_runtime": 25.7077, + "eval_samples_per_second": 38.899, + "eval_steps_per_second": 2.451, + "step": 950 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 1.617246389389038, + "learning_rate": 9.266600594648168e-07, + "loss": 0.9161, + "step": 951 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 1.6363615989685059, + "learning_rate": 9.217046580773044e-07, + "loss": 0.9123, + "step": 952 + }, + { + "epoch": 1.6987522281639929, + "grad_norm": 1.6289929151535034, + "learning_rate": 9.167492566897919e-07, + "loss": 0.9148, + "step": 953 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 1.7203320264816284, + "learning_rate": 9.117938553022795e-07, + "loss": 0.9035, + "step": 954 + }, + { + "epoch": 1.7023172905525845, + "grad_norm": 1.6997250318527222, + "learning_rate": 9.068384539147672e-07, + "loss": 0.9123, + "step": 955 + }, + { + "epoch": 1.7040998217468806, + "grad_norm": 1.7788165807724, + "learning_rate": 9.018830525272549e-07, + "loss": 0.9198, + "step": 956 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.708892822265625, + "learning_rate": 8.969276511397423e-07, + "loss": 0.8984, + "step": 957 + }, + { + "epoch": 1.7076648841354722, + "grad_norm": 1.6467701196670532, + "learning_rate": 8.9197224975223e-07, + "loss": 0.9046, + "step": 958 + }, + { + "epoch": 1.7094474153297683, + "grad_norm": 1.7599132061004639, + "learning_rate": 8.870168483647176e-07, + "loss": 0.9361, + "step": 959 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 1.6088155508041382, + "learning_rate": 8.820614469772053e-07, + "loss": 0.8796, + "step": 960 + }, + { + "epoch": 1.7112299465240641, + "eval_loss": 0.9957713484764099, + "eval_runtime": 25.9512, + "eval_samples_per_second": 38.534, + "eval_steps_per_second": 2.428, + "step": 960 + }, + { + "epoch": 1.71301247771836, + "grad_norm": 1.698625087738037, + "learning_rate": 8.771060455896927e-07, + "loss": 0.8925, + "step": 961 + }, + { + "epoch": 1.714795008912656, + "grad_norm": 1.6810694932937622, + "learning_rate": 8.721506442021804e-07, + "loss": 0.908, + "step": 962 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 1.6146069765090942, + "learning_rate": 8.671952428146681e-07, + "loss": 0.9064, + "step": 963 + }, + { + "epoch": 1.7183600713012477, + "grad_norm": 1.6394168138504028, + "learning_rate": 8.622398414271557e-07, + "loss": 0.9309, + "step": 964 + }, + { + "epoch": 1.7201426024955437, + "grad_norm": 1.6711269617080688, + "learning_rate": 8.572844400396432e-07, + "loss": 0.9436, + "step": 965 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 1.6253998279571533, + "learning_rate": 8.523290386521308e-07, + "loss": 0.93, + "step": 966 + }, + { + "epoch": 1.7237076648841354, + "grad_norm": 1.7100143432617188, + "learning_rate": 8.473736372646185e-07, + "loss": 0.9047, + "step": 967 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 1.6870434284210205, + "learning_rate": 8.424182358771062e-07, + "loss": 0.8878, + "step": 968 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.6940484046936035, + "learning_rate": 8.374628344895937e-07, + "loss": 0.9092, + "step": 969 + }, + { + "epoch": 1.7290552584670231, + "grad_norm": 1.7144818305969238, + "learning_rate": 8.325074331020813e-07, + "loss": 0.8831, + "step": 970 + }, + { + "epoch": 1.7290552584670231, + "eval_loss": 0.996261715888977, + "eval_runtime": 25.6913, + "eval_samples_per_second": 38.924, + "eval_steps_per_second": 2.452, + "step": 970 + }, + { + "epoch": 1.7308377896613192, + "grad_norm": 1.7108317613601685, + "learning_rate": 8.275520317145689e-07, + "loss": 0.8759, + "step": 971 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 1.8309522867202759, + "learning_rate": 8.225966303270566e-07, + "loss": 0.8997, + "step": 972 + }, + { + "epoch": 1.7344028520499108, + "grad_norm": 1.6917412281036377, + "learning_rate": 8.176412289395442e-07, + "loss": 0.9109, + "step": 973 + }, + { + "epoch": 1.736185383244207, + "grad_norm": 1.776153802871704, + "learning_rate": 8.126858275520318e-07, + "loss": 0.9283, + "step": 974 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 1.7653323411941528, + "learning_rate": 8.077304261645194e-07, + "loss": 0.8964, + "step": 975 + }, + { + "epoch": 1.7397504456327986, + "grad_norm": 1.7760730981826782, + "learning_rate": 8.02775024777007e-07, + "loss": 0.9283, + "step": 976 + }, + { + "epoch": 1.7415329768270946, + "grad_norm": 1.7228769063949585, + "learning_rate": 7.978196233894946e-07, + "loss": 0.905, + "step": 977 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 1.7163366079330444, + "learning_rate": 7.928642220019823e-07, + "loss": 0.9439, + "step": 978 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 1.750165581703186, + "learning_rate": 7.879088206144698e-07, + "loss": 0.8971, + "step": 979 + }, + { + "epoch": 1.7468805704099821, + "grad_norm": 1.7466638088226318, + "learning_rate": 7.829534192269575e-07, + "loss": 0.9069, + "step": 980 + }, + { + "epoch": 1.7468805704099821, + "eval_loss": 0.9950440526008606, + "eval_runtime": 25.7702, + "eval_samples_per_second": 38.805, + "eval_steps_per_second": 2.445, + "step": 980 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 1.672875165939331, + "learning_rate": 7.77998017839445e-07, + "loss": 0.9218, + "step": 981 + }, + { + "epoch": 1.750445632798574, + "grad_norm": 1.7255640029907227, + "learning_rate": 7.730426164519327e-07, + "loss": 0.9122, + "step": 982 + }, + { + "epoch": 1.7522281639928698, + "grad_norm": 1.6890486478805542, + "learning_rate": 7.680872150644203e-07, + "loss": 0.9098, + "step": 983 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 1.6408272981643677, + "learning_rate": 7.631318136769079e-07, + "loss": 0.921, + "step": 984 + }, + { + "epoch": 1.7557932263814617, + "grad_norm": 1.6204485893249512, + "learning_rate": 7.581764122893955e-07, + "loss": 0.9109, + "step": 985 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 1.645685076713562, + "learning_rate": 7.532210109018831e-07, + "loss": 0.9066, + "step": 986 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 1.7292003631591797, + "learning_rate": 7.482656095143708e-07, + "loss": 0.8945, + "step": 987 + }, + { + "epoch": 1.7611408199643495, + "grad_norm": 1.6503880023956299, + "learning_rate": 7.433102081268583e-07, + "loss": 0.8997, + "step": 988 + }, + { + "epoch": 1.7629233511586453, + "grad_norm": 1.7059553861618042, + "learning_rate": 7.383548067393459e-07, + "loss": 0.9076, + "step": 989 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.6760096549987793, + "learning_rate": 7.333994053518336e-07, + "loss": 0.9466, + "step": 990 + }, + { + "epoch": 1.7647058823529411, + "eval_loss": 0.995206892490387, + "eval_runtime": 25.6123, + "eval_samples_per_second": 39.044, + "eval_steps_per_second": 2.46, + "step": 990 + }, + { + "epoch": 1.7664884135472372, + "grad_norm": 1.683337926864624, + "learning_rate": 7.284440039643212e-07, + "loss": 0.8894, + "step": 991 + }, + { + "epoch": 1.768270944741533, + "grad_norm": 1.7856427431106567, + "learning_rate": 7.234886025768089e-07, + "loss": 0.891, + "step": 992 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 1.7322144508361816, + "learning_rate": 7.185332011892963e-07, + "loss": 0.9189, + "step": 993 + }, + { + "epoch": 1.771836007130125, + "grad_norm": 1.6664427518844604, + "learning_rate": 7.13577799801784e-07, + "loss": 0.8929, + "step": 994 + }, + { + "epoch": 1.7736185383244205, + "grad_norm": 1.6912422180175781, + "learning_rate": 7.086223984142717e-07, + "loss": 0.8925, + "step": 995 + }, + { + "epoch": 1.7754010695187166, + "grad_norm": 1.8387762308120728, + "learning_rate": 7.036669970267593e-07, + "loss": 0.9081, + "step": 996 + }, + { + "epoch": 1.7771836007130126, + "grad_norm": 1.683955430984497, + "learning_rate": 6.987115956392468e-07, + "loss": 0.8802, + "step": 997 + }, + { + "epoch": 1.7789661319073082, + "grad_norm": 1.760162115097046, + "learning_rate": 6.937561942517344e-07, + "loss": 0.9419, + "step": 998 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 1.680673599243164, + "learning_rate": 6.888007928642221e-07, + "loss": 0.9024, + "step": 999 + }, + { + "epoch": 1.7825311942959001, + "grad_norm": 1.6909873485565186, + "learning_rate": 6.838453914767098e-07, + "loss": 0.9069, + "step": 1000 + }, + { + "epoch": 1.7825311942959001, + "eval_loss": 0.9940561652183533, + "eval_runtime": 25.7043, + "eval_samples_per_second": 38.904, + "eval_steps_per_second": 2.451, + "step": 1000 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 1.7442917823791504, + "learning_rate": 6.788899900891972e-07, + "loss": 0.9188, + "step": 1001 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 1.658192753791809, + "learning_rate": 6.739345887016849e-07, + "loss": 0.9085, + "step": 1002 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 1.6885861158370972, + "learning_rate": 6.689791873141725e-07, + "loss": 0.8958, + "step": 1003 + }, + { + "epoch": 1.7896613190730837, + "grad_norm": 1.7260538339614868, + "learning_rate": 6.640237859266602e-07, + "loss": 0.9086, + "step": 1004 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 1.752336859703064, + "learning_rate": 6.590683845391476e-07, + "loss": 0.8825, + "step": 1005 + }, + { + "epoch": 1.7932263814616756, + "grad_norm": 1.6466569900512695, + "learning_rate": 6.541129831516353e-07, + "loss": 0.8537, + "step": 1006 + }, + { + "epoch": 1.7950089126559714, + "grad_norm": 1.669892430305481, + "learning_rate": 6.49157581764123e-07, + "loss": 0.9325, + "step": 1007 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 1.8119384050369263, + "learning_rate": 6.442021803766106e-07, + "loss": 0.9359, + "step": 1008 + }, + { + "epoch": 1.7985739750445633, + "grad_norm": 1.6587778329849243, + "learning_rate": 6.392467789890981e-07, + "loss": 0.9183, + "step": 1009 + }, + { + "epoch": 1.8003565062388591, + "grad_norm": 1.6684181690216064, + "learning_rate": 6.342913776015857e-07, + "loss": 0.8833, + "step": 1010 + }, + { + "epoch": 1.8003565062388591, + "eval_loss": 0.9942149519920349, + "eval_runtime": 25.6683, + "eval_samples_per_second": 38.959, + "eval_steps_per_second": 2.454, + "step": 1010 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 1.7912577390670776, + "learning_rate": 6.293359762140734e-07, + "loss": 0.937, + "step": 1011 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 1.7254101037979126, + "learning_rate": 6.24380574826561e-07, + "loss": 0.9412, + "step": 1012 + }, + { + "epoch": 1.8057040998217468, + "grad_norm": 1.7295520305633545, + "learning_rate": 6.194251734390486e-07, + "loss": 0.9308, + "step": 1013 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 1.736169695854187, + "learning_rate": 6.144697720515362e-07, + "loss": 0.8654, + "step": 1014 + }, + { + "epoch": 1.8092691622103387, + "grad_norm": 1.6945925951004028, + "learning_rate": 6.095143706640238e-07, + "loss": 0.9093, + "step": 1015 + }, + { + "epoch": 1.8110516934046346, + "grad_norm": 1.674950361251831, + "learning_rate": 6.045589692765114e-07, + "loss": 0.8958, + "step": 1016 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 1.73616361618042, + "learning_rate": 5.996035678889991e-07, + "loss": 0.9306, + "step": 1017 + }, + { + "epoch": 1.8146167557932262, + "grad_norm": 1.694348931312561, + "learning_rate": 5.946481665014866e-07, + "loss": 0.9172, + "step": 1018 + }, + { + "epoch": 1.8163992869875223, + "grad_norm": 1.7344218492507935, + "learning_rate": 5.896927651139743e-07, + "loss": 0.8831, + "step": 1019 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.6549162864685059, + "learning_rate": 5.847373637264618e-07, + "loss": 0.8691, + "step": 1020 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.9938483238220215, + "eval_runtime": 25.6316, + "eval_samples_per_second": 39.014, + "eval_steps_per_second": 2.458, + "step": 1020 + }, + { + "epoch": 1.819964349376114, + "grad_norm": 1.638862133026123, + "learning_rate": 5.797819623389495e-07, + "loss": 0.8828, + "step": 1021 + }, + { + "epoch": 1.82174688057041, + "grad_norm": 1.7828065156936646, + "learning_rate": 5.748265609514372e-07, + "loss": 0.9372, + "step": 1022 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 1.6707323789596558, + "learning_rate": 5.698711595639247e-07, + "loss": 0.8912, + "step": 1023 + }, + { + "epoch": 1.8253119429590017, + "grad_norm": 1.707122564315796, + "learning_rate": 5.649157581764124e-07, + "loss": 0.9071, + "step": 1024 + }, + { + "epoch": 1.8270944741532977, + "grad_norm": 1.7146416902542114, + "learning_rate": 5.599603567888999e-07, + "loss": 0.914, + "step": 1025 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 1.7814098596572876, + "learning_rate": 5.550049554013876e-07, + "loss": 0.9267, + "step": 1026 + }, + { + "epoch": 1.8306595365418894, + "grad_norm": 1.7249929904937744, + "learning_rate": 5.500495540138751e-07, + "loss": 0.9132, + "step": 1027 + }, + { + "epoch": 1.8324420677361855, + "grad_norm": 1.764886498451233, + "learning_rate": 5.450941526263628e-07, + "loss": 0.908, + "step": 1028 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 1.780199408531189, + "learning_rate": 5.401387512388505e-07, + "loss": 0.8821, + "step": 1029 + }, + { + "epoch": 1.8360071301247771, + "grad_norm": 1.6997710466384888, + "learning_rate": 5.35183349851338e-07, + "loss": 0.8834, + "step": 1030 + }, + { + "epoch": 1.8360071301247771, + "eval_loss": 0.9937179088592529, + "eval_runtime": 25.8881, + "eval_samples_per_second": 38.628, + "eval_steps_per_second": 2.434, + "step": 1030 + }, + { + "epoch": 1.8377896613190732, + "grad_norm": 1.8370707035064697, + "learning_rate": 5.302279484638257e-07, + "loss": 0.923, + "step": 1031 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 1.6870957612991333, + "learning_rate": 5.252725470763132e-07, + "loss": 0.9032, + "step": 1032 + }, + { + "epoch": 1.8413547237076648, + "grad_norm": 1.792143702507019, + "learning_rate": 5.203171456888009e-07, + "loss": 0.893, + "step": 1033 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 1.7480674982070923, + "learning_rate": 5.153617443012885e-07, + "loss": 0.9187, + "step": 1034 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 1.7090226411819458, + "learning_rate": 5.104063429137761e-07, + "loss": 0.893, + "step": 1035 + }, + { + "epoch": 1.8467023172905526, + "grad_norm": 1.6919946670532227, + "learning_rate": 5.054509415262637e-07, + "loss": 0.9254, + "step": 1036 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 1.7311183214187622, + "learning_rate": 5.004955401387513e-07, + "loss": 0.8938, + "step": 1037 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 1.6775836944580078, + "learning_rate": 4.955401387512389e-07, + "loss": 0.935, + "step": 1038 + }, + { + "epoch": 1.8520499108734403, + "grad_norm": 1.7318252325057983, + "learning_rate": 4.905847373637266e-07, + "loss": 0.8997, + "step": 1039 + }, + { + "epoch": 1.8538324420677363, + "grad_norm": 1.7152644395828247, + "learning_rate": 4.856293359762141e-07, + "loss": 0.9001, + "step": 1040 + }, + { + "epoch": 1.8538324420677363, + "eval_loss": 0.992533802986145, + "eval_runtime": 25.9005, + "eval_samples_per_second": 38.609, + "eval_steps_per_second": 2.432, + "step": 1040 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 1.613964557647705, + "learning_rate": 4.806739345887018e-07, + "loss": 0.8808, + "step": 1041 + }, + { + "epoch": 1.857397504456328, + "grad_norm": 1.6681315898895264, + "learning_rate": 4.7571853320118933e-07, + "loss": 0.8872, + "step": 1042 + }, + { + "epoch": 1.8591800356506238, + "grad_norm": 1.867087483406067, + "learning_rate": 4.70763131813677e-07, + "loss": 0.9167, + "step": 1043 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 1.7233860492706299, + "learning_rate": 4.6580773042616455e-07, + "loss": 0.9293, + "step": 1044 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 1.6797863245010376, + "learning_rate": 4.608523290386522e-07, + "loss": 0.8912, + "step": 1045 + }, + { + "epoch": 1.8645276292335116, + "grad_norm": 1.761204719543457, + "learning_rate": 4.5589692765113977e-07, + "loss": 0.8773, + "step": 1046 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 1.6859080791473389, + "learning_rate": 4.5094152626362743e-07, + "loss": 0.9091, + "step": 1047 + }, + { + "epoch": 1.8680926916221035, + "grad_norm": 1.7600855827331543, + "learning_rate": 4.45986124876115e-07, + "loss": 0.9268, + "step": 1048 + }, + { + "epoch": 1.8698752228163993, + "grad_norm": 1.656213641166687, + "learning_rate": 4.4103072348860265e-07, + "loss": 0.9073, + "step": 1049 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 1.7546846866607666, + "learning_rate": 4.360753221010902e-07, + "loss": 0.9244, + "step": 1050 + }, + { + "epoch": 1.8716577540106951, + "eval_loss": 0.99254310131073, + "eval_runtime": 25.8491, + "eval_samples_per_second": 38.686, + "eval_steps_per_second": 2.437, + "step": 1050 + }, + { + "epoch": 1.8734402852049912, + "grad_norm": 1.7467200756072998, + "learning_rate": 4.3111992071357786e-07, + "loss": 0.9226, + "step": 1051 + }, + { + "epoch": 1.875222816399287, + "grad_norm": 1.6825129985809326, + "learning_rate": 4.261645193260654e-07, + "loss": 0.9166, + "step": 1052 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 1.7524358034133911, + "learning_rate": 4.212091179385531e-07, + "loss": 0.8991, + "step": 1053 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 1.770285964012146, + "learning_rate": 4.1625371655104064e-07, + "loss": 0.9491, + "step": 1054 + }, + { + "epoch": 1.8805704099821747, + "grad_norm": 1.684815526008606, + "learning_rate": 4.112983151635283e-07, + "loss": 0.911, + "step": 1055 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.7561120986938477, + "learning_rate": 4.063429137760159e-07, + "loss": 0.9183, + "step": 1056 + }, + { + "epoch": 1.8841354723707666, + "grad_norm": 1.6486444473266602, + "learning_rate": 4.013875123885035e-07, + "loss": 0.877, + "step": 1057 + }, + { + "epoch": 1.8859180035650622, + "grad_norm": 1.6989293098449707, + "learning_rate": 3.964321110009911e-07, + "loss": 0.9141, + "step": 1058 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 1.6929652690887451, + "learning_rate": 3.9147670961347873e-07, + "loss": 0.8975, + "step": 1059 + }, + { + "epoch": 1.8894830659536543, + "grad_norm": 1.671325445175171, + "learning_rate": 3.8652130822596634e-07, + "loss": 0.9029, + "step": 1060 + }, + { + "epoch": 1.8894830659536543, + "eval_loss": 0.9915664792060852, + "eval_runtime": 25.7783, + "eval_samples_per_second": 38.792, + "eval_steps_per_second": 2.444, + "step": 1060 + }, + { + "epoch": 1.89126559714795, + "grad_norm": 1.6696364879608154, + "learning_rate": 3.8156590683845395e-07, + "loss": 0.9056, + "step": 1061 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 1.676077961921692, + "learning_rate": 3.7661050545094156e-07, + "loss": 0.9, + "step": 1062 + }, + { + "epoch": 1.8948306595365418, + "grad_norm": 1.6590983867645264, + "learning_rate": 3.7165510406342917e-07, + "loss": 0.9123, + "step": 1063 + }, + { + "epoch": 1.8966131907308377, + "grad_norm": 1.6489439010620117, + "learning_rate": 3.666997026759168e-07, + "loss": 0.8964, + "step": 1064 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 1.6285649538040161, + "learning_rate": 3.6174430128840444e-07, + "loss": 0.8824, + "step": 1065 + }, + { + "epoch": 1.9001782531194296, + "grad_norm": 1.6999493837356567, + "learning_rate": 3.56788899900892e-07, + "loss": 0.9206, + "step": 1066 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 1.7763421535491943, + "learning_rate": 3.5183349851337966e-07, + "loss": 0.9161, + "step": 1067 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 1.6540220975875854, + "learning_rate": 3.468780971258672e-07, + "loss": 0.9353, + "step": 1068 + }, + { + "epoch": 1.9055258467023173, + "grad_norm": 1.6708558797836304, + "learning_rate": 3.419226957383549e-07, + "loss": 0.88, + "step": 1069 + }, + { + "epoch": 1.9073083778966131, + "grad_norm": 1.6973531246185303, + "learning_rate": 3.3696729435084243e-07, + "loss": 0.9, + "step": 1070 + }, + { + "epoch": 1.9073083778966131, + "eval_loss": 0.9920867085456848, + "eval_runtime": 25.9739, + "eval_samples_per_second": 38.5, + "eval_steps_per_second": 2.426, + "step": 1070 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.6297328472137451, + "learning_rate": 3.320118929633301e-07, + "loss": 0.9112, + "step": 1071 + }, + { + "epoch": 1.910873440285205, + "grad_norm": 1.69383704662323, + "learning_rate": 3.2705649157581765e-07, + "loss": 0.9088, + "step": 1072 + }, + { + "epoch": 1.9126559714795008, + "grad_norm": 1.6893271207809448, + "learning_rate": 3.221010901883053e-07, + "loss": 0.861, + "step": 1073 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 1.786592721939087, + "learning_rate": 3.1714568880079287e-07, + "loss": 0.9026, + "step": 1074 + }, + { + "epoch": 1.9162210338680927, + "grad_norm": 1.8169891834259033, + "learning_rate": 3.121902874132805e-07, + "loss": 0.9249, + "step": 1075 + }, + { + "epoch": 1.9180035650623886, + "grad_norm": 1.6998229026794434, + "learning_rate": 3.072348860257681e-07, + "loss": 0.9056, + "step": 1076 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 1.7679296731948853, + "learning_rate": 3.022794846382557e-07, + "loss": 0.9003, + "step": 1077 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 1.663244605064392, + "learning_rate": 2.973240832507433e-07, + "loss": 0.9093, + "step": 1078 + }, + { + "epoch": 1.9233511586452763, + "grad_norm": 1.7235232591629028, + "learning_rate": 2.923686818632309e-07, + "loss": 0.8856, + "step": 1079 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 1.6739834547042847, + "learning_rate": 2.874132804757186e-07, + "loss": 0.9137, + "step": 1080 + }, + { + "epoch": 1.9251336898395723, + "eval_loss": 0.9913634061813354, + "eval_runtime": 25.9488, + "eval_samples_per_second": 38.537, + "eval_steps_per_second": 2.428, + "step": 1080 + }, + { + "epoch": 1.926916221033868, + "grad_norm": 1.6595032215118408, + "learning_rate": 2.824578790882062e-07, + "loss": 0.8928, + "step": 1081 + }, + { + "epoch": 1.928698752228164, + "grad_norm": 1.7184573411941528, + "learning_rate": 2.775024777006938e-07, + "loss": 0.9226, + "step": 1082 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 1.7487850189208984, + "learning_rate": 2.725470763131814e-07, + "loss": 0.9335, + "step": 1083 + }, + { + "epoch": 1.9322638146167557, + "grad_norm": 1.6264859437942505, + "learning_rate": 2.67591674925669e-07, + "loss": 0.9092, + "step": 1084 + }, + { + "epoch": 1.9340463458110517, + "grad_norm": 1.784679889678955, + "learning_rate": 2.626362735381566e-07, + "loss": 0.9258, + "step": 1085 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 1.645410180091858, + "learning_rate": 2.5768087215064423e-07, + "loss": 0.918, + "step": 1086 + }, + { + "epoch": 1.9376114081996434, + "grad_norm": 1.7359968423843384, + "learning_rate": 2.5272547076313184e-07, + "loss": 0.9283, + "step": 1087 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 1.7158929109573364, + "learning_rate": 2.4777006937561945e-07, + "loss": 0.8898, + "step": 1088 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.7459858655929565, + "learning_rate": 2.4281466798810706e-07, + "loss": 0.9302, + "step": 1089 + }, + { + "epoch": 1.9429590017825311, + "grad_norm": 1.7283644676208496, + "learning_rate": 2.3785926660059467e-07, + "loss": 0.9021, + "step": 1090 + }, + { + "epoch": 1.9429590017825311, + "eval_loss": 0.9908037781715393, + "eval_runtime": 25.9896, + "eval_samples_per_second": 38.477, + "eval_steps_per_second": 2.424, + "step": 1090 + }, + { + "epoch": 1.9447415329768272, + "grad_norm": 1.673015832901001, + "learning_rate": 2.3290386521308227e-07, + "loss": 0.8822, + "step": 1091 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 1.6898096799850464, + "learning_rate": 2.2794846382556988e-07, + "loss": 0.8827, + "step": 1092 + }, + { + "epoch": 1.9483065953654188, + "grad_norm": 1.6672441959381104, + "learning_rate": 2.229930624380575e-07, + "loss": 0.9085, + "step": 1093 + }, + { + "epoch": 1.950089126559715, + "grad_norm": 1.6861313581466675, + "learning_rate": 2.180376610505451e-07, + "loss": 0.9116, + "step": 1094 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 1.7200956344604492, + "learning_rate": 2.130822596630327e-07, + "loss": 0.9095, + "step": 1095 + }, + { + "epoch": 1.9536541889483066, + "grad_norm": 1.6788264513015747, + "learning_rate": 2.0812685827552032e-07, + "loss": 0.9123, + "step": 1096 + }, + { + "epoch": 1.9554367201426026, + "grad_norm": 1.596208095550537, + "learning_rate": 2.0317145688800795e-07, + "loss": 0.8672, + "step": 1097 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 1.7107036113739014, + "learning_rate": 1.9821605550049556e-07, + "loss": 0.8914, + "step": 1098 + }, + { + "epoch": 1.9590017825311943, + "grad_norm": 1.7238177061080933, + "learning_rate": 1.9326065411298317e-07, + "loss": 0.9258, + "step": 1099 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 1.8259961605072021, + "learning_rate": 1.8830525272547078e-07, + "loss": 0.9138, + "step": 1100 + }, + { + "epoch": 1.9607843137254903, + "eval_loss": 0.99046790599823, + "eval_runtime": 25.9327, + "eval_samples_per_second": 38.561, + "eval_steps_per_second": 2.429, + "step": 1100 + }, + { + "epoch": 1.962566844919786, + "grad_norm": 1.7623475790023804, + "learning_rate": 1.833498513379584e-07, + "loss": 0.9137, + "step": 1101 + }, + { + "epoch": 1.964349376114082, + "grad_norm": 1.6571979522705078, + "learning_rate": 1.78394449950446e-07, + "loss": 0.8816, + "step": 1102 + }, + { + "epoch": 1.966131907308378, + "grad_norm": 1.8103545904159546, + "learning_rate": 1.734390485629336e-07, + "loss": 0.9206, + "step": 1103 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 1.7835015058517456, + "learning_rate": 1.6848364717542122e-07, + "loss": 0.9153, + "step": 1104 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 1.7936205863952637, + "learning_rate": 1.6352824578790883e-07, + "loss": 0.9228, + "step": 1105 + }, + { + "epoch": 1.9714795008912656, + "grad_norm": 1.6824809312820435, + "learning_rate": 1.5857284440039643e-07, + "loss": 0.943, + "step": 1106 + }, + { + "epoch": 1.9732620320855614, + "grad_norm": 1.6871892213821411, + "learning_rate": 1.5361744301288404e-07, + "loss": 0.8884, + "step": 1107 + }, + { + "epoch": 1.9750445632798574, + "grad_norm": 1.6781103610992432, + "learning_rate": 1.4866204162537165e-07, + "loss": 0.9194, + "step": 1108 + }, + { + "epoch": 1.9768270944741533, + "grad_norm": 1.7025740146636963, + "learning_rate": 1.437066402378593e-07, + "loss": 0.8969, + "step": 1109 + }, + { + "epoch": 1.9786096256684491, + "grad_norm": 1.637919306755066, + "learning_rate": 1.387512388503469e-07, + "loss": 0.8786, + "step": 1110 + }, + { + "epoch": 1.9786096256684491, + "eval_loss": 0.9895098209381104, + "eval_runtime": 25.8128, + "eval_samples_per_second": 38.741, + "eval_steps_per_second": 2.441, + "step": 1110 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 1.6665587425231934, + "learning_rate": 1.337958374628345e-07, + "loss": 0.8595, + "step": 1111 + }, + { + "epoch": 1.982174688057041, + "grad_norm": 1.6462030410766602, + "learning_rate": 1.2884043607532211e-07, + "loss": 0.9294, + "step": 1112 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 1.6541804075241089, + "learning_rate": 1.2388503468780972e-07, + "loss": 0.9129, + "step": 1113 + }, + { + "epoch": 1.985739750445633, + "grad_norm": 1.692040205001831, + "learning_rate": 1.1892963330029733e-07, + "loss": 0.935, + "step": 1114 + }, + { + "epoch": 1.9875222816399287, + "grad_norm": 1.64594566822052, + "learning_rate": 1.1397423191278494e-07, + "loss": 0.8774, + "step": 1115 + }, + { + "epoch": 1.9893048128342246, + "grad_norm": 1.645636796951294, + "learning_rate": 1.0901883052527255e-07, + "loss": 0.9163, + "step": 1116 + }, + { + "epoch": 1.9910873440285206, + "grad_norm": 1.6432050466537476, + "learning_rate": 1.0406342913776016e-07, + "loss": 0.8675, + "step": 1117 + }, + { + "epoch": 1.9928698752228164, + "grad_norm": 1.7587097883224487, + "learning_rate": 9.910802775024778e-08, + "loss": 0.9541, + "step": 1118 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 1.705899715423584, + "learning_rate": 9.415262636273539e-08, + "loss": 0.9088, + "step": 1119 + }, + { + "epoch": 1.9964349376114083, + "grad_norm": 1.652106761932373, + "learning_rate": 8.9197224975223e-08, + "loss": 0.8887, + "step": 1120 + }, + { + "epoch": 1.9964349376114083, + "eval_loss": 0.9894319772720337, + "eval_runtime": 25.8963, + "eval_samples_per_second": 38.615, + "eval_steps_per_second": 2.433, + "step": 1120 + }, + { + "epoch": 1.998217468805704, + "grad_norm": 1.6611700057983398, + "learning_rate": 8.424182358771061e-08, + "loss": 0.9036, + "step": 1121 + }, + { + "epoch": 2.0, + "grad_norm": 1.6227155923843384, + "learning_rate": 7.928642220019822e-08, + "loss": 0.8938, + "step": 1122 + } + ], + "logging_steps": 1, + "max_steps": 1122, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.590741983802163e+17, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}