diff --git "a/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" "b/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/Qwen2.5-7B/babylm_hop_words4_10M_seed0/runs/checkpoint-1122/trainer_state.json" @@ -0,0 +1,8783 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1122, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017825311942959, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.683, + "step": 1 + }, + { + "epoch": 0.0035650623885918, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6428, + "step": 2 + }, + { + "epoch": 0.0053475935828877, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6195, + "step": 3 + }, + { + "epoch": 0.0071301247771836, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6425, + "step": 4 + }, + { + "epoch": 0.008912655971479501, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6463, + "step": 5 + }, + { + "epoch": 0.0106951871657754, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6414, + "step": 6 + }, + { + "epoch": 0.012477718360071301, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6741, + "step": 7 + }, + { + "epoch": 0.0142602495543672, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6424, + "step": 8 + }, + { + "epoch": 0.016042780748663103, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6623, + "step": 9 + }, + { + "epoch": 0.017825311942959002, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6533, + "step": 10 + }, + { + "epoch": 0.017825311942959002, + "eval_loss": 1.6557202339172363, + "eval_runtime": 43.8817, + "eval_samples_per_second": 22.789, + "eval_steps_per_second": 1.436, + "step": 10 + }, + { + "epoch": 0.0196078431372549, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6458, + "step": 11 + }, + { + "epoch": 0.0213903743315508, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6579, + "step": 12 + }, + { + "epoch": 0.023172905525846704, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6622, + "step": 13 + }, + { + "epoch": 0.024955436720142603, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6817, + "step": 14 + }, + { + "epoch": 0.026737967914438502, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6487, + "step": 15 + }, + { + "epoch": 0.0285204991087344, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6536, + "step": 16 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 3.260859966278076, + "learning_rate": 4.424778761061947e-08, + "loss": 1.6625, + "step": 17 + }, + { + "epoch": 0.03208556149732621, + "grad_norm": 3.260859966278076, + "learning_rate": 4.424778761061947e-08, + "loss": 1.6537, + "step": 18 + }, + { + "epoch": 0.0338680926916221, + "grad_norm": 2.555511713027954, + "learning_rate": 8.849557522123894e-08, + "loss": 1.6453, + "step": 19 + }, + { + "epoch": 0.035650623885918005, + "grad_norm": 4.205122947692871, + "learning_rate": 1.327433628318584e-07, + "loss": 1.6643, + "step": 20 + }, + { + "epoch": 0.035650623885918005, + "eval_loss": 1.6556264162063599, + "eval_runtime": 43.7614, + "eval_samples_per_second": 22.851, + "eval_steps_per_second": 1.44, + "step": 20 + }, + { + "epoch": 0.0374331550802139, + "grad_norm": 3.736807346343994, + "learning_rate": 1.7699115044247788e-07, + "loss": 1.6559, + "step": 21 + }, + { 
+ "epoch": 0.0392156862745098, + "grad_norm": 3.6770436763763428, + "learning_rate": 2.2123893805309737e-07, + "loss": 1.6467, + "step": 22 + }, + { + "epoch": 0.040998217468805706, + "grad_norm": 3.84171986579895, + "learning_rate": 2.654867256637168e-07, + "loss": 1.6601, + "step": 23 + }, + { + "epoch": 0.0427807486631016, + "grad_norm": 3.9061005115509033, + "learning_rate": 3.097345132743363e-07, + "loss": 1.6531, + "step": 24 + }, + { + "epoch": 0.044563279857397504, + "grad_norm": 3.594496726989746, + "learning_rate": 3.5398230088495575e-07, + "loss": 1.6616, + "step": 25 + }, + { + "epoch": 0.04634581105169341, + "grad_norm": 3.2176733016967773, + "learning_rate": 3.9823008849557525e-07, + "loss": 1.6419, + "step": 26 + }, + { + "epoch": 0.0481283422459893, + "grad_norm": 3.9928395748138428, + "learning_rate": 4.4247787610619474e-07, + "loss": 1.6579, + "step": 27 + }, + { + "epoch": 0.049910873440285206, + "grad_norm": 3.372333288192749, + "learning_rate": 4.867256637168142e-07, + "loss": 1.609, + "step": 28 + }, + { + "epoch": 0.05169340463458111, + "grad_norm": 4.2034831047058105, + "learning_rate": 5.309734513274336e-07, + "loss": 1.6619, + "step": 29 + }, + { + "epoch": 0.053475935828877004, + "grad_norm": 4.264431953430176, + "learning_rate": 5.752212389380532e-07, + "loss": 1.6542, + "step": 30 + }, + { + "epoch": 0.053475935828877004, + "eval_loss": 1.6338881254196167, + "eval_runtime": 45.0967, + "eval_samples_per_second": 22.175, + "eval_steps_per_second": 1.397, + "step": 30 + }, + { + "epoch": 0.05525846702317291, + "grad_norm": 3.44136381149292, + "learning_rate": 6.194690265486726e-07, + "loss": 1.6431, + "step": 31 + }, + { + "epoch": 0.0570409982174688, + "grad_norm": 1.7967804670333862, + "learning_rate": 6.637168141592922e-07, + "loss": 1.6195, + "step": 32 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 2.048090696334839, + "learning_rate": 7.079646017699115e-07, + "loss": 1.5952, + "step": 33 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 2.1011312007904053, + "learning_rate": 7.522123893805311e-07, + "loss": 1.6177, + "step": 34 + }, + { + "epoch": 0.062388591800356503, + "grad_norm": 2.0587246417999268, + "learning_rate": 7.964601769911505e-07, + "loss": 1.5957, + "step": 35 + }, + { + "epoch": 0.06417112299465241, + "grad_norm": 2.3372628688812256, + "learning_rate": 8.4070796460177e-07, + "loss": 1.6202, + "step": 36 + }, + { + "epoch": 0.0659536541889483, + "grad_norm": 2.5044116973876953, + "learning_rate": 8.849557522123895e-07, + "loss": 1.5929, + "step": 37 + }, + { + "epoch": 0.0677361853832442, + "grad_norm": 2.064378261566162, + "learning_rate": 9.292035398230089e-07, + "loss": 1.5967, + "step": 38 + }, + { + "epoch": 0.06951871657754011, + "grad_norm": 2.2238266468048096, + "learning_rate": 9.734513274336284e-07, + "loss": 1.5837, + "step": 39 + }, + { + "epoch": 0.07130124777183601, + "grad_norm": 2.6334006786346436, + "learning_rate": 1.017699115044248e-06, + "loss": 1.6019, + "step": 40 + }, + { + "epoch": 0.07130124777183601, + "eval_loss": 1.5835978984832764, + "eval_runtime": 46.0453, + "eval_samples_per_second": 21.718, + "eval_steps_per_second": 1.368, + "step": 40 + }, + { + "epoch": 0.07308377896613191, + "grad_norm": 2.2472009658813477, + "learning_rate": 1.0619469026548673e-06, + "loss": 1.5946, + "step": 41 + }, + { + "epoch": 0.0748663101604278, + "grad_norm": 1.96908700466156, + "learning_rate": 1.106194690265487e-06, + "loss": 1.5945, + "step": 42 + }, + { + "epoch": 0.0766488413547237, + "grad_norm": 
1.6488579511642456, + "learning_rate": 1.1504424778761064e-06, + "loss": 1.5721, + "step": 43 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 1.7816352844238281, + "learning_rate": 1.1946902654867258e-06, + "loss": 1.5722, + "step": 44 + }, + { + "epoch": 0.08021390374331551, + "grad_norm": 2.124027967453003, + "learning_rate": 1.2389380530973452e-06, + "loss": 1.5593, + "step": 45 + }, + { + "epoch": 0.08199643493761141, + "grad_norm": 1.7037169933319092, + "learning_rate": 1.2831858407079647e-06, + "loss": 1.5581, + "step": 46 + }, + { + "epoch": 0.08377896613190731, + "grad_norm": 1.7509331703186035, + "learning_rate": 1.3274336283185843e-06, + "loss": 1.5216, + "step": 47 + }, + { + "epoch": 0.0855614973262032, + "grad_norm": 1.6668604612350464, + "learning_rate": 1.3716814159292036e-06, + "loss": 1.5189, + "step": 48 + }, + { + "epoch": 0.0873440285204991, + "grad_norm": 2.4238839149475098, + "learning_rate": 1.415929203539823e-06, + "loss": 1.5294, + "step": 49 + }, + { + "epoch": 0.08912655971479501, + "grad_norm": 2.1640570163726807, + "learning_rate": 1.4601769911504427e-06, + "loss": 1.5123, + "step": 50 + }, + { + "epoch": 0.08912655971479501, + "eval_loss": 1.4958887100219727, + "eval_runtime": 46.3715, + "eval_samples_per_second": 21.565, + "eval_steps_per_second": 1.359, + "step": 50 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.5261659622192383, + "learning_rate": 1.5044247787610621e-06, + "loss": 1.4612, + "step": 51 + }, + { + "epoch": 0.09269162210338681, + "grad_norm": 2.2561535835266113, + "learning_rate": 1.5486725663716816e-06, + "loss": 1.4658, + "step": 52 + }, + { + "epoch": 0.0944741532976827, + "grad_norm": 1.9507025480270386, + "learning_rate": 1.592920353982301e-06, + "loss": 1.4318, + "step": 53 + }, + { + "epoch": 0.0962566844919786, + "grad_norm": 1.9108023643493652, + "learning_rate": 1.6371681415929204e-06, + "loss": 1.4402, + "step": 54 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 1.9863903522491455, + "learning_rate": 1.68141592920354e-06, + "loss": 1.4153, + "step": 55 + }, + { + "epoch": 0.09982174688057041, + "grad_norm": 1.8348286151885986, + "learning_rate": 1.7256637168141593e-06, + "loss": 1.3972, + "step": 56 + }, + { + "epoch": 0.10160427807486631, + "grad_norm": 1.7218098640441895, + "learning_rate": 1.769911504424779e-06, + "loss": 1.4074, + "step": 57 + }, + { + "epoch": 0.10338680926916222, + "grad_norm": 1.701070785522461, + "learning_rate": 1.8141592920353984e-06, + "loss": 1.3963, + "step": 58 + }, + { + "epoch": 0.1051693404634581, + "grad_norm": 1.723047137260437, + "learning_rate": 1.8584070796460179e-06, + "loss": 1.3701, + "step": 59 + }, + { + "epoch": 0.10695187165775401, + "grad_norm": 1.7656834125518799, + "learning_rate": 1.9026548672566373e-06, + "loss": 1.3684, + "step": 60 + }, + { + "epoch": 0.10695187165775401, + "eval_loss": 1.372704267501831, + "eval_runtime": 46.4402, + "eval_samples_per_second": 21.533, + "eval_steps_per_second": 1.357, + "step": 60 + }, + { + "epoch": 0.10873440285204991, + "grad_norm": 1.3581100702285767, + "learning_rate": 1.9469026548672567e-06, + "loss": 1.335, + "step": 61 + }, + { + "epoch": 0.11051693404634581, + "grad_norm": 1.767719030380249, + "learning_rate": 1.991150442477876e-06, + "loss": 1.3582, + "step": 62 + }, + { + "epoch": 0.11229946524064172, + "grad_norm": 2.281069278717041, + "learning_rate": 2.035398230088496e-06, + "loss": 1.3439, + "step": 63 + }, + { + "epoch": 0.1140819964349376, + "grad_norm": 1.6699604988098145, + "learning_rate": 
2.079646017699115e-06, + "loss": 1.3473, + "step": 64 + }, + { + "epoch": 0.11586452762923351, + "grad_norm": 2.3921921253204346, + "learning_rate": 2.1238938053097345e-06, + "loss": 1.3396, + "step": 65 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.202098846435547, + "learning_rate": 2.1681415929203544e-06, + "loss": 1.301, + "step": 66 + }, + { + "epoch": 0.11942959001782531, + "grad_norm": 1.4074530601501465, + "learning_rate": 2.212389380530974e-06, + "loss": 1.2906, + "step": 67 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 2.1387805938720703, + "learning_rate": 2.256637168141593e-06, + "loss": 1.3198, + "step": 68 + }, + { + "epoch": 0.12299465240641712, + "grad_norm": 1.847856044769287, + "learning_rate": 2.3008849557522127e-06, + "loss": 1.2936, + "step": 69 + }, + { + "epoch": 0.12477718360071301, + "grad_norm": 1.4546856880187988, + "learning_rate": 2.345132743362832e-06, + "loss": 1.2913, + "step": 70 + }, + { + "epoch": 0.12477718360071301, + "eval_loss": 1.3064121007919312, + "eval_runtime": 46.4122, + "eval_samples_per_second": 21.546, + "eval_steps_per_second": 1.357, + "step": 70 + }, + { + "epoch": 0.1265597147950089, + "grad_norm": 2.0278193950653076, + "learning_rate": 2.3893805309734516e-06, + "loss": 1.2712, + "step": 71 + }, + { + "epoch": 0.12834224598930483, + "grad_norm": 1.7863024473190308, + "learning_rate": 2.433628318584071e-06, + "loss": 1.2981, + "step": 72 + }, + { + "epoch": 0.13012477718360071, + "grad_norm": 1.429937720298767, + "learning_rate": 2.4778761061946905e-06, + "loss": 1.2857, + "step": 73 + }, + { + "epoch": 0.1319073083778966, + "grad_norm": 1.7546721696853638, + "learning_rate": 2.52212389380531e-06, + "loss": 1.2608, + "step": 74 + }, + { + "epoch": 0.13368983957219252, + "grad_norm": 2.067112445831299, + "learning_rate": 2.5663716814159294e-06, + "loss": 1.3058, + "step": 75 + }, + { + "epoch": 0.1354723707664884, + "grad_norm": 1.893581509590149, + "learning_rate": 2.6106194690265492e-06, + "loss": 1.2815, + "step": 76 + }, + { + "epoch": 0.13725490196078433, + "grad_norm": 2.0614898204803467, + "learning_rate": 2.6548672566371687e-06, + "loss": 1.2204, + "step": 77 + }, + { + "epoch": 0.13903743315508021, + "grad_norm": 1.8954098224639893, + "learning_rate": 2.6991150442477877e-06, + "loss": 1.2517, + "step": 78 + }, + { + "epoch": 0.1408199643493761, + "grad_norm": 1.7369916439056396, + "learning_rate": 2.743362831858407e-06, + "loss": 1.2706, + "step": 79 + }, + { + "epoch": 0.14260249554367202, + "grad_norm": 2.026573896408081, + "learning_rate": 2.7876106194690266e-06, + "loss": 1.2368, + "step": 80 + }, + { + "epoch": 0.14260249554367202, + "eval_loss": 1.2641066312789917, + "eval_runtime": 46.4069, + "eval_samples_per_second": 21.549, + "eval_steps_per_second": 1.358, + "step": 80 + }, + { + "epoch": 0.1443850267379679, + "grad_norm": 1.4173997640609741, + "learning_rate": 2.831858407079646e-06, + "loss": 1.2187, + "step": 81 + }, + { + "epoch": 0.14616755793226383, + "grad_norm": 1.9600279331207275, + "learning_rate": 2.876106194690266e-06, + "loss": 1.25, + "step": 82 + }, + { + "epoch": 0.14795008912655971, + "grad_norm": 2.2337169647216797, + "learning_rate": 2.9203539823008853e-06, + "loss": 1.2143, + "step": 83 + }, + { + "epoch": 0.1497326203208556, + "grad_norm": 1.4930493831634521, + "learning_rate": 2.9646017699115048e-06, + "loss": 1.2178, + "step": 84 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 1.950060248374939, + "learning_rate": 3.0088495575221242e-06, + "loss": 1.237, + "step": 85 + 
}, + { + "epoch": 0.1532976827094474, + "grad_norm": 1.7046960592269897, + "learning_rate": 3.0530973451327432e-06, + "loss": 1.1585, + "step": 86 + }, + { + "epoch": 0.15508021390374332, + "grad_norm": 1.6781989336013794, + "learning_rate": 3.097345132743363e-06, + "loss": 1.1892, + "step": 87 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 2.416377544403076, + "learning_rate": 3.1415929203539825e-06, + "loss": 1.2191, + "step": 88 + }, + { + "epoch": 0.1586452762923351, + "grad_norm": 1.7030267715454102, + "learning_rate": 3.185840707964602e-06, + "loss": 1.2403, + "step": 89 + }, + { + "epoch": 0.16042780748663102, + "grad_norm": 2.404744863510132, + "learning_rate": 3.2300884955752214e-06, + "loss": 1.2364, + "step": 90 + }, + { + "epoch": 0.16042780748663102, + "eval_loss": 1.2330397367477417, + "eval_runtime": 46.5752, + "eval_samples_per_second": 21.471, + "eval_steps_per_second": 1.353, + "step": 90 + }, + { + "epoch": 0.1622103386809269, + "grad_norm": 1.3477970361709595, + "learning_rate": 3.274336283185841e-06, + "loss": 1.2123, + "step": 91 + }, + { + "epoch": 0.16399286987522282, + "grad_norm": 2.5285091400146484, + "learning_rate": 3.3185840707964607e-06, + "loss": 1.2276, + "step": 92 + }, + { + "epoch": 0.1657754010695187, + "grad_norm": 1.5784391164779663, + "learning_rate": 3.36283185840708e-06, + "loss": 1.2056, + "step": 93 + }, + { + "epoch": 0.16755793226381463, + "grad_norm": 1.4000625610351562, + "learning_rate": 3.407079646017699e-06, + "loss": 1.2324, + "step": 94 + }, + { + "epoch": 0.16934046345811052, + "grad_norm": 2.468836784362793, + "learning_rate": 3.4513274336283186e-06, + "loss": 1.2209, + "step": 95 + }, + { + "epoch": 0.1711229946524064, + "grad_norm": 1.5707480907440186, + "learning_rate": 3.495575221238938e-06, + "loss": 1.1717, + "step": 96 + }, + { + "epoch": 0.17290552584670232, + "grad_norm": 2.063491106033325, + "learning_rate": 3.539823008849558e-06, + "loss": 1.1874, + "step": 97 + }, + { + "epoch": 0.1746880570409982, + "grad_norm": 1.7270640134811401, + "learning_rate": 3.5840707964601774e-06, + "loss": 1.2179, + "step": 98 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 1.9771426916122437, + "learning_rate": 3.628318584070797e-06, + "loss": 1.1508, + "step": 99 + }, + { + "epoch": 0.17825311942959002, + "grad_norm": 2.1519346237182617, + "learning_rate": 3.6725663716814163e-06, + "loss": 1.1956, + "step": 100 + }, + { + "epoch": 0.17825311942959002, + "eval_loss": 1.2065033912658691, + "eval_runtime": 46.5272, + "eval_samples_per_second": 21.493, + "eval_steps_per_second": 1.354, + "step": 100 + }, + { + "epoch": 0.1800356506238859, + "grad_norm": 1.335666298866272, + "learning_rate": 3.7168141592920357e-06, + "loss": 1.18, + "step": 101 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.2174997329711914, + "learning_rate": 3.7610619469026547e-06, + "loss": 1.164, + "step": 102 + }, + { + "epoch": 0.1836007130124777, + "grad_norm": 1.5584592819213867, + "learning_rate": 3.8053097345132746e-06, + "loss": 1.133, + "step": 103 + }, + { + "epoch": 0.18538324420677363, + "grad_norm": 2.222188949584961, + "learning_rate": 3.849557522123894e-06, + "loss": 1.1489, + "step": 104 + }, + { + "epoch": 0.18716577540106952, + "grad_norm": 1.8726121187210083, + "learning_rate": 3.8938053097345135e-06, + "loss": 1.1791, + "step": 105 + }, + { + "epoch": 0.1889483065953654, + "grad_norm": 1.8814388513565063, + "learning_rate": 3.938053097345133e-06, + "loss": 1.1545, + "step": 106 + }, + { + "epoch": 0.19073083778966132, + "grad_norm": 
2.3826377391815186, + "learning_rate": 3.982300884955752e-06, + "loss": 1.1467, + "step": 107 + }, + { + "epoch": 0.1925133689839572, + "grad_norm": 1.398267388343811, + "learning_rate": 4.026548672566372e-06, + "loss": 1.1761, + "step": 108 + }, + { + "epoch": 0.19429590017825313, + "grad_norm": 1.3779840469360352, + "learning_rate": 4.070796460176992e-06, + "loss": 1.1323, + "step": 109 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 1.9406187534332275, + "learning_rate": 4.115044247787611e-06, + "loss": 1.1125, + "step": 110 + }, + { + "epoch": 0.19607843137254902, + "eval_loss": 1.1835510730743408, + "eval_runtime": 46.5879, + "eval_samples_per_second": 21.465, + "eval_steps_per_second": 1.352, + "step": 110 + }, + { + "epoch": 0.19786096256684493, + "grad_norm": 1.5905953645706177, + "learning_rate": 4.15929203539823e-06, + "loss": 1.1075, + "step": 111 + }, + { + "epoch": 0.19964349376114082, + "grad_norm": 1.9858345985412598, + "learning_rate": 4.20353982300885e-06, + "loss": 1.1772, + "step": 112 + }, + { + "epoch": 0.2014260249554367, + "grad_norm": 1.8801683187484741, + "learning_rate": 4.247787610619469e-06, + "loss": 1.1625, + "step": 113 + }, + { + "epoch": 0.20320855614973263, + "grad_norm": 1.507340431213379, + "learning_rate": 4.2920353982300885e-06, + "loss": 1.1491, + "step": 114 + }, + { + "epoch": 0.20499108734402852, + "grad_norm": 1.7214304208755493, + "learning_rate": 4.336283185840709e-06, + "loss": 1.1368, + "step": 115 + }, + { + "epoch": 0.20677361853832443, + "grad_norm": 1.971928358078003, + "learning_rate": 4.380530973451328e-06, + "loss": 1.1497, + "step": 116 + }, + { + "epoch": 0.20855614973262032, + "grad_norm": 1.5467543601989746, + "learning_rate": 4.424778761061948e-06, + "loss": 1.099, + "step": 117 + }, + { + "epoch": 0.2103386809269162, + "grad_norm": 1.7273290157318115, + "learning_rate": 4.469026548672566e-06, + "loss": 1.0918, + "step": 118 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 1.7540112733840942, + "learning_rate": 4.513274336283186e-06, + "loss": 1.1105, + "step": 119 + }, + { + "epoch": 0.21390374331550802, + "grad_norm": 1.4684147834777832, + "learning_rate": 4.557522123893805e-06, + "loss": 1.12, + "step": 120 + }, + { + "epoch": 0.21390374331550802, + "eval_loss": 1.165776014328003, + "eval_runtime": 46.5571, + "eval_samples_per_second": 21.479, + "eval_steps_per_second": 1.353, + "step": 120 + }, + { + "epoch": 0.21568627450980393, + "grad_norm": 1.7996920347213745, + "learning_rate": 4.6017699115044254e-06, + "loss": 1.1068, + "step": 121 + }, + { + "epoch": 0.21746880570409982, + "grad_norm": 2.2431159019470215, + "learning_rate": 4.646017699115045e-06, + "loss": 1.1161, + "step": 122 + }, + { + "epoch": 0.2192513368983957, + "grad_norm": 1.894515037536621, + "learning_rate": 4.690265486725664e-06, + "loss": 1.0878, + "step": 123 + }, + { + "epoch": 0.22103386809269163, + "grad_norm": 1.8979915380477905, + "learning_rate": 4.734513274336284e-06, + "loss": 1.1148, + "step": 124 + }, + { + "epoch": 0.22281639928698752, + "grad_norm": 2.508988618850708, + "learning_rate": 4.778761061946903e-06, + "loss": 1.1406, + "step": 125 + }, + { + "epoch": 0.22459893048128343, + "grad_norm": 1.8342119455337524, + "learning_rate": 4.823008849557523e-06, + "loss": 1.0711, + "step": 126 + }, + { + "epoch": 0.22638146167557932, + "grad_norm": 1.8743572235107422, + "learning_rate": 4.867256637168142e-06, + "loss": 1.1042, + "step": 127 + }, + { + "epoch": 0.2281639928698752, + "grad_norm": 2.5183823108673096, + "learning_rate": 
4.9115044247787615e-06, + "loss": 1.1277, + "step": 128 + }, + { + "epoch": 0.22994652406417113, + "grad_norm": 1.3967655897140503, + "learning_rate": 4.955752212389381e-06, + "loss": 1.0907, + "step": 129 + }, + { + "epoch": 0.23172905525846701, + "grad_norm": 2.2105870246887207, + "learning_rate": 5e-06, + "loss": 1.1028, + "step": 130 + }, + { + "epoch": 0.23172905525846701, + "eval_loss": 1.1516406536102295, + "eval_runtime": 46.5761, + "eval_samples_per_second": 21.47, + "eval_steps_per_second": 1.353, + "step": 130 + }, + { + "epoch": 0.23351158645276293, + "grad_norm": 2.0359160900115967, + "learning_rate": 4.995044598612488e-06, + "loss": 1.0557, + "step": 131 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 2.0358471870422363, + "learning_rate": 4.990089197224976e-06, + "loss": 1.0805, + "step": 132 + }, + { + "epoch": 0.23707664884135474, + "grad_norm": 2.4560561180114746, + "learning_rate": 4.985133795837464e-06, + "loss": 1.1219, + "step": 133 + }, + { + "epoch": 0.23885918003565063, + "grad_norm": 1.5883917808532715, + "learning_rate": 4.980178394449951e-06, + "loss": 1.1179, + "step": 134 + }, + { + "epoch": 0.24064171122994651, + "grad_norm": 2.193816661834717, + "learning_rate": 4.975222993062438e-06, + "loss": 1.1235, + "step": 135 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 1.2323447465896606, + "learning_rate": 4.970267591674926e-06, + "loss": 1.095, + "step": 136 + }, + { + "epoch": 0.24420677361853832, + "grad_norm": 2.667142391204834, + "learning_rate": 4.965312190287414e-06, + "loss": 1.1391, + "step": 137 + }, + { + "epoch": 0.24598930481283424, + "grad_norm": 1.4862529039382935, + "learning_rate": 4.960356788899901e-06, + "loss": 1.0967, + "step": 138 + }, + { + "epoch": 0.24777183600713013, + "grad_norm": 2.315704584121704, + "learning_rate": 4.955401387512389e-06, + "loss": 1.0874, + "step": 139 + }, + { + "epoch": 0.24955436720142601, + "grad_norm": 1.8777433633804321, + "learning_rate": 4.950445986124876e-06, + "loss": 1.0983, + "step": 140 + }, + { + "epoch": 0.24955436720142601, + "eval_loss": 1.1395500898361206, + "eval_runtime": 46.6881, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 140 + }, + { + "epoch": 0.25133689839572193, + "grad_norm": 1.809224009513855, + "learning_rate": 4.945490584737364e-06, + "loss": 1.0752, + "step": 141 + }, + { + "epoch": 0.2531194295900178, + "grad_norm": 1.6294512748718262, + "learning_rate": 4.9405351833498515e-06, + "loss": 1.09, + "step": 142 + }, + { + "epoch": 0.2549019607843137, + "grad_norm": 2.189276933670044, + "learning_rate": 4.935579781962339e-06, + "loss": 1.0583, + "step": 143 + }, + { + "epoch": 0.25668449197860965, + "grad_norm": 1.9446332454681396, + "learning_rate": 4.930624380574827e-06, + "loss": 1.1093, + "step": 144 + }, + { + "epoch": 0.25846702317290554, + "grad_norm": 1.3689433336257935, + "learning_rate": 4.925668979187315e-06, + "loss": 1.0821, + "step": 145 + }, + { + "epoch": 0.26024955436720143, + "grad_norm": 1.5317623615264893, + "learning_rate": 4.920713577799802e-06, + "loss": 1.0563, + "step": 146 + }, + { + "epoch": 0.2620320855614973, + "grad_norm": 1.8125908374786377, + "learning_rate": 4.915758176412289e-06, + "loss": 1.0721, + "step": 147 + }, + { + "epoch": 0.2638146167557932, + "grad_norm": 1.3997691869735718, + "learning_rate": 4.9108027750247775e-06, + "loss": 1.0583, + "step": 148 + }, + { + "epoch": 0.26559714795008915, + "grad_norm": 2.459528684616089, + "learning_rate": 4.9058473736372656e-06, + "loss": 1.0657, + "step": 149 
+ }, + { + "epoch": 0.26737967914438504, + "grad_norm": 1.4545484781265259, + "learning_rate": 4.900891972249753e-06, + "loss": 1.0698, + "step": 150 + }, + { + "epoch": 0.26737967914438504, + "eval_loss": 1.1283138990402222, + "eval_runtime": 46.5528, + "eval_samples_per_second": 21.481, + "eval_steps_per_second": 1.353, + "step": 150 + }, + { + "epoch": 0.26916221033868093, + "grad_norm": 1.7834577560424805, + "learning_rate": 4.89593657086224e-06, + "loss": 1.051, + "step": 151 + }, + { + "epoch": 0.2709447415329768, + "grad_norm": 2.1426002979278564, + "learning_rate": 4.890981169474728e-06, + "loss": 1.0561, + "step": 152 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.8575809001922607, + "learning_rate": 4.886025768087215e-06, + "loss": 1.0573, + "step": 153 + }, + { + "epoch": 0.27450980392156865, + "grad_norm": 1.7787772417068481, + "learning_rate": 4.881070366699703e-06, + "loss": 1.0508, + "step": 154 + }, + { + "epoch": 0.27629233511586454, + "grad_norm": 1.6413190364837646, + "learning_rate": 4.876114965312191e-06, + "loss": 1.0829, + "step": 155 + }, + { + "epoch": 0.27807486631016043, + "grad_norm": 2.1804025173187256, + "learning_rate": 4.871159563924679e-06, + "loss": 1.0663, + "step": 156 + }, + { + "epoch": 0.2798573975044563, + "grad_norm": 2.081756353378296, + "learning_rate": 4.866204162537166e-06, + "loss": 1.0621, + "step": 157 + }, + { + "epoch": 0.2816399286987522, + "grad_norm": 1.5092004537582397, + "learning_rate": 4.861248761149653e-06, + "loss": 1.071, + "step": 158 + }, + { + "epoch": 0.28342245989304815, + "grad_norm": 2.1826558113098145, + "learning_rate": 4.8562933597621405e-06, + "loss": 1.0904, + "step": 159 + }, + { + "epoch": 0.28520499108734404, + "grad_norm": 1.9138243198394775, + "learning_rate": 4.8513379583746286e-06, + "loss": 1.0662, + "step": 160 + }, + { + "epoch": 0.28520499108734404, + "eval_loss": 1.1119412183761597, + "eval_runtime": 46.6301, + "eval_samples_per_second": 21.445, + "eval_steps_per_second": 1.351, + "step": 160 + }, + { + "epoch": 0.28698752228163993, + "grad_norm": 1.5497403144836426, + "learning_rate": 4.846382556987117e-06, + "loss": 1.0312, + "step": 161 + }, + { + "epoch": 0.2887700534759358, + "grad_norm": 1.6409493684768677, + "learning_rate": 4.841427155599604e-06, + "loss": 1.0779, + "step": 162 + }, + { + "epoch": 0.2905525846702317, + "grad_norm": 1.6504517793655396, + "learning_rate": 4.836471754212091e-06, + "loss": 1.0901, + "step": 163 + }, + { + "epoch": 0.29233511586452765, + "grad_norm": 1.433727741241455, + "learning_rate": 4.831516352824579e-06, + "loss": 1.0555, + "step": 164 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.4695453643798828, + "learning_rate": 4.826560951437067e-06, + "loss": 1.0421, + "step": 165 + }, + { + "epoch": 0.29590017825311943, + "grad_norm": 1.7415134906768799, + "learning_rate": 4.8216055500495545e-06, + "loss": 1.0147, + "step": 166 + }, + { + "epoch": 0.2976827094474153, + "grad_norm": 1.3279739618301392, + "learning_rate": 4.816650148662042e-06, + "loss": 1.0764, + "step": 167 + }, + { + "epoch": 0.2994652406417112, + "grad_norm": 1.5744524002075195, + "learning_rate": 4.81169474727453e-06, + "loss": 1.0412, + "step": 168 + }, + { + "epoch": 0.30124777183600715, + "grad_norm": 1.8190844058990479, + "learning_rate": 4.806739345887017e-06, + "loss": 1.0448, + "step": 169 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 1.7416859865188599, + "learning_rate": 4.801783944499504e-06, + "loss": 1.052, + "step": 170 + }, + { + "epoch": 0.30303030303030304, 
+ "eval_loss": 1.1019716262817383, + "eval_runtime": 46.6122, + "eval_samples_per_second": 21.454, + "eval_steps_per_second": 1.352, + "step": 170 + }, + { + "epoch": 0.3048128342245989, + "grad_norm": 1.7649554014205933, + "learning_rate": 4.7968285431119924e-06, + "loss": 1.0654, + "step": 171 + }, + { + "epoch": 0.3065953654188948, + "grad_norm": 2.2370729446411133, + "learning_rate": 4.7918731417244805e-06, + "loss": 1.0519, + "step": 172 + }, + { + "epoch": 0.3083778966131907, + "grad_norm": 1.7433515787124634, + "learning_rate": 4.786917740336968e-06, + "loss": 1.0431, + "step": 173 + }, + { + "epoch": 0.31016042780748665, + "grad_norm": 1.4815385341644287, + "learning_rate": 4.781962338949455e-06, + "loss": 1.0614, + "step": 174 + }, + { + "epoch": 0.31194295900178254, + "grad_norm": 1.8037469387054443, + "learning_rate": 4.777006937561943e-06, + "loss": 1.0199, + "step": 175 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 1.5110318660736084, + "learning_rate": 4.77205153617443e-06, + "loss": 1.0676, + "step": 176 + }, + { + "epoch": 0.3155080213903743, + "grad_norm": 1.390753149986267, + "learning_rate": 4.767096134786918e-06, + "loss": 1.0451, + "step": 177 + }, + { + "epoch": 0.3172905525846702, + "grad_norm": 1.6916826963424683, + "learning_rate": 4.762140733399406e-06, + "loss": 1.041, + "step": 178 + }, + { + "epoch": 0.31907308377896615, + "grad_norm": 1.5903483629226685, + "learning_rate": 4.757185332011893e-06, + "loss": 1.0095, + "step": 179 + }, + { + "epoch": 0.32085561497326204, + "grad_norm": 1.4499986171722412, + "learning_rate": 4.752229930624381e-06, + "loss": 1.0617, + "step": 180 + }, + { + "epoch": 0.32085561497326204, + "eval_loss": 1.0959594249725342, + "eval_runtime": 46.7286, + "eval_samples_per_second": 21.4, + "eval_steps_per_second": 1.348, + "step": 180 + }, + { + "epoch": 0.3226381461675579, + "grad_norm": 1.4177703857421875, + "learning_rate": 4.747274529236869e-06, + "loss": 1.0189, + "step": 181 + }, + { + "epoch": 0.3244206773618538, + "grad_norm": 1.5359338521957397, + "learning_rate": 4.742319127849356e-06, + "loss": 1.0539, + "step": 182 + }, + { + "epoch": 0.32620320855614976, + "grad_norm": 1.726008415222168, + "learning_rate": 4.7373637264618435e-06, + "loss": 1.053, + "step": 183 + }, + { + "epoch": 0.32798573975044565, + "grad_norm": 2.085338830947876, + "learning_rate": 4.732408325074332e-06, + "loss": 1.0649, + "step": 184 + }, + { + "epoch": 0.32976827094474154, + "grad_norm": 2.3661019802093506, + "learning_rate": 4.727452923686819e-06, + "loss": 1.0542, + "step": 185 + }, + { + "epoch": 0.3315508021390374, + "grad_norm": 1.524652123451233, + "learning_rate": 4.722497522299306e-06, + "loss": 1.0562, + "step": 186 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.3639343976974487, + "learning_rate": 4.717542120911794e-06, + "loss": 1.0518, + "step": 187 + }, + { + "epoch": 0.33511586452762926, + "grad_norm": 1.6668117046356201, + "learning_rate": 4.712586719524282e-06, + "loss": 1.0644, + "step": 188 + }, + { + "epoch": 0.33689839572192515, + "grad_norm": 1.5209795236587524, + "learning_rate": 4.7076313181367695e-06, + "loss": 1.0203, + "step": 189 + }, + { + "epoch": 0.33868092691622104, + "grad_norm": 1.3939913511276245, + "learning_rate": 4.702675916749257e-06, + "loss": 1.0712, + "step": 190 + }, + { + "epoch": 0.33868092691622104, + "eval_loss": 1.0875341892242432, + "eval_runtime": 46.5406, + "eval_samples_per_second": 21.487, + "eval_steps_per_second": 1.354, + "step": 190 + }, + { + "epoch": 0.3404634581105169, + 
"grad_norm": 1.3856011629104614, + "learning_rate": 4.697720515361745e-06, + "loss": 1.0199, + "step": 191 + }, + { + "epoch": 0.3422459893048128, + "grad_norm": 1.608984112739563, + "learning_rate": 4.692765113974233e-06, + "loss": 1.0507, + "step": 192 + }, + { + "epoch": 0.34402852049910876, + "grad_norm": 1.7646530866622925, + "learning_rate": 4.68780971258672e-06, + "loss": 1.0043, + "step": 193 + }, + { + "epoch": 0.34581105169340465, + "grad_norm": 1.4916458129882812, + "learning_rate": 4.682854311199207e-06, + "loss": 1.0132, + "step": 194 + }, + { + "epoch": 0.34759358288770054, + "grad_norm": 1.4389764070510864, + "learning_rate": 4.677898909811695e-06, + "loss": 1.0088, + "step": 195 + }, + { + "epoch": 0.3493761140819964, + "grad_norm": 1.8099793195724487, + "learning_rate": 4.672943508424183e-06, + "loss": 1.0278, + "step": 196 + }, + { + "epoch": 0.3511586452762923, + "grad_norm": 1.6475704908370972, + "learning_rate": 4.667988107036671e-06, + "loss": 1.0169, + "step": 197 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 1.4079694747924805, + "learning_rate": 4.663032705649158e-06, + "loss": 1.0327, + "step": 198 + }, + { + "epoch": 0.35472370766488415, + "grad_norm": 1.91256844997406, + "learning_rate": 4.658077304261645e-06, + "loss": 1.0322, + "step": 199 + }, + { + "epoch": 0.35650623885918004, + "grad_norm": 1.4115188121795654, + "learning_rate": 4.653121902874133e-06, + "loss": 1.0105, + "step": 200 + }, + { + "epoch": 0.35650623885918004, + "eval_loss": 1.083202600479126, + "eval_runtime": 46.6309, + "eval_samples_per_second": 21.445, + "eval_steps_per_second": 1.351, + "step": 200 + }, + { + "epoch": 0.3582887700534759, + "grad_norm": 1.5581523180007935, + "learning_rate": 4.648166501486621e-06, + "loss": 1.0371, + "step": 201 + }, + { + "epoch": 0.3600713012477718, + "grad_norm": 2.1223926544189453, + "learning_rate": 4.643211100099108e-06, + "loss": 1.0471, + "step": 202 + }, + { + "epoch": 0.36185383244206776, + "grad_norm": 1.6544064283370972, + "learning_rate": 4.638255698711596e-06, + "loss": 1.0059, + "step": 203 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.9283210039138794, + "learning_rate": 4.633300297324084e-06, + "loss": 1.0192, + "step": 204 + }, + { + "epoch": 0.36541889483065954, + "grad_norm": 1.6795734167099, + "learning_rate": 4.628344895936571e-06, + "loss": 1.0084, + "step": 205 + }, + { + "epoch": 0.3672014260249554, + "grad_norm": 1.3590185642242432, + "learning_rate": 4.6233894945490585e-06, + "loss": 1.0137, + "step": 206 + }, + { + "epoch": 0.3689839572192513, + "grad_norm": 1.6634080410003662, + "learning_rate": 4.6184340931615466e-06, + "loss": 1.0059, + "step": 207 + }, + { + "epoch": 0.37076648841354726, + "grad_norm": 1.5877068042755127, + "learning_rate": 4.613478691774035e-06, + "loss": 1.0255, + "step": 208 + }, + { + "epoch": 0.37254901960784315, + "grad_norm": 1.8005928993225098, + "learning_rate": 4.608523290386522e-06, + "loss": 1.0474, + "step": 209 + }, + { + "epoch": 0.37433155080213903, + "grad_norm": 1.8539314270019531, + "learning_rate": 4.603567888999009e-06, + "loss": 1.011, + "step": 210 + }, + { + "epoch": 0.37433155080213903, + "eval_loss": 1.07635498046875, + "eval_runtime": 46.6192, + "eval_samples_per_second": 21.45, + "eval_steps_per_second": 1.351, + "step": 210 + }, + { + "epoch": 0.3761140819964349, + "grad_norm": 1.860028624534607, + "learning_rate": 4.598612487611497e-06, + "loss": 1.0141, + "step": 211 + }, + { + "epoch": 0.3778966131907308, + "grad_norm": 1.5086894035339355, + 
"learning_rate": 4.5936570862239844e-06, + "loss": 1.0376, + "step": 212 + }, + { + "epoch": 0.37967914438502676, + "grad_norm": 1.4755423069000244, + "learning_rate": 4.5887016848364725e-06, + "loss": 0.9872, + "step": 213 + }, + { + "epoch": 0.38146167557932265, + "grad_norm": 1.6511203050613403, + "learning_rate": 4.58374628344896e-06, + "loss": 1.0179, + "step": 214 + }, + { + "epoch": 0.38324420677361853, + "grad_norm": 1.6704275608062744, + "learning_rate": 4.578790882061447e-06, + "loss": 1.0257, + "step": 215 + }, + { + "epoch": 0.3850267379679144, + "grad_norm": 2.220994234085083, + "learning_rate": 4.573835480673935e-06, + "loss": 1.0393, + "step": 216 + }, + { + "epoch": 0.3868092691622103, + "grad_norm": 1.5368359088897705, + "learning_rate": 4.568880079286422e-06, + "loss": 0.9882, + "step": 217 + }, + { + "epoch": 0.38859180035650626, + "grad_norm": 2.0973968505859375, + "learning_rate": 4.5639246778989096e-06, + "loss": 1.0245, + "step": 218 + }, + { + "epoch": 0.39037433155080214, + "grad_norm": 1.4747536182403564, + "learning_rate": 4.558969276511398e-06, + "loss": 1.0095, + "step": 219 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.0183606147766113, + "learning_rate": 4.554013875123886e-06, + "loss": 0.9998, + "step": 220 + }, + { + "epoch": 0.39215686274509803, + "eval_loss": 1.0749539136886597, + "eval_runtime": 46.6635, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 220 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 1.8413187265396118, + "learning_rate": 4.549058473736373e-06, + "loss": 1.0348, + "step": 221 + }, + { + "epoch": 0.39572192513368987, + "grad_norm": 1.7214562892913818, + "learning_rate": 4.54410307234886e-06, + "loss": 1.0154, + "step": 222 + }, + { + "epoch": 0.39750445632798576, + "grad_norm": 1.8179748058319092, + "learning_rate": 4.539147670961348e-06, + "loss": 0.9957, + "step": 223 + }, + { + "epoch": 0.39928698752228164, + "grad_norm": 1.905828833580017, + "learning_rate": 4.534192269573836e-06, + "loss": 0.9873, + "step": 224 + }, + { + "epoch": 0.40106951871657753, + "grad_norm": 1.3392513990402222, + "learning_rate": 4.529236868186324e-06, + "loss": 0.9834, + "step": 225 + }, + { + "epoch": 0.4028520499108734, + "grad_norm": 1.6817848682403564, + "learning_rate": 4.524281466798811e-06, + "loss": 0.9789, + "step": 226 + }, + { + "epoch": 0.40463458110516937, + "grad_norm": 1.7255293130874634, + "learning_rate": 4.519326065411299e-06, + "loss": 0.9868, + "step": 227 + }, + { + "epoch": 0.40641711229946526, + "grad_norm": 1.6086952686309814, + "learning_rate": 4.514370664023786e-06, + "loss": 0.9916, + "step": 228 + }, + { + "epoch": 0.40819964349376114, + "grad_norm": 2.0437726974487305, + "learning_rate": 4.509415262636274e-06, + "loss": 1.0168, + "step": 229 + }, + { + "epoch": 0.40998217468805703, + "grad_norm": 1.719529390335083, + "learning_rate": 4.5044598612487615e-06, + "loss": 1.0293, + "step": 230 + }, + { + "epoch": 0.40998217468805703, + "eval_loss": 1.0670229196548462, + "eval_runtime": 46.6553, + "eval_samples_per_second": 21.434, + "eval_steps_per_second": 1.35, + "step": 230 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 1.3140292167663574, + "learning_rate": 4.499504459861249e-06, + "loss": 0.98, + "step": 231 + }, + { + "epoch": 0.41354723707664887, + "grad_norm": 1.678130030632019, + "learning_rate": 4.494549058473737e-06, + "loss": 1.0202, + "step": 232 + }, + { + "epoch": 0.41532976827094475, + "grad_norm": 1.4645118713378906, + "learning_rate": 4.489593657086224e-06, 
+ "loss": 0.9959, + "step": 233 + }, + { + "epoch": 0.41711229946524064, + "grad_norm": 1.5366790294647217, + "learning_rate": 4.484638255698711e-06, + "loss": 0.9832, + "step": 234 + }, + { + "epoch": 0.41889483065953653, + "grad_norm": 2.286126136779785, + "learning_rate": 4.479682854311199e-06, + "loss": 0.9998, + "step": 235 + }, + { + "epoch": 0.4206773618538324, + "grad_norm": 2.260972261428833, + "learning_rate": 4.4747274529236875e-06, + "loss": 1.0139, + "step": 236 + }, + { + "epoch": 0.42245989304812837, + "grad_norm": 1.9152151346206665, + "learning_rate": 4.469772051536175e-06, + "loss": 1.0002, + "step": 237 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 1.7313346862792969, + "learning_rate": 4.464816650148662e-06, + "loss": 0.9828, + "step": 238 + }, + { + "epoch": 0.42602495543672014, + "grad_norm": 1.9363431930541992, + "learning_rate": 4.45986124876115e-06, + "loss": 0.9889, + "step": 239 + }, + { + "epoch": 0.42780748663101603, + "grad_norm": 1.4276996850967407, + "learning_rate": 4.454905847373638e-06, + "loss": 1.0121, + "step": 240 + }, + { + "epoch": 0.42780748663101603, + "eval_loss": 1.0605522394180298, + "eval_runtime": 46.6086, + "eval_samples_per_second": 21.455, + "eval_steps_per_second": 1.352, + "step": 240 + }, + { + "epoch": 0.4295900178253119, + "grad_norm": 1.8374282121658325, + "learning_rate": 4.449950445986125e-06, + "loss": 1.006, + "step": 241 + }, + { + "epoch": 0.43137254901960786, + "grad_norm": 1.6907085180282593, + "learning_rate": 4.444995044598613e-06, + "loss": 0.9724, + "step": 242 + }, + { + "epoch": 0.43315508021390375, + "grad_norm": 1.5171328783035278, + "learning_rate": 4.440039643211101e-06, + "loss": 1.0077, + "step": 243 + }, + { + "epoch": 0.43493761140819964, + "grad_norm": 1.7695019245147705, + "learning_rate": 4.435084241823588e-06, + "loss": 0.9975, + "step": 244 + }, + { + "epoch": 0.43672014260249553, + "grad_norm": 1.7725054025650024, + "learning_rate": 4.430128840436076e-06, + "loss": 1.05, + "step": 245 + }, + { + "epoch": 0.4385026737967914, + "grad_norm": 1.385197639465332, + "learning_rate": 4.425173439048563e-06, + "loss": 0.9982, + "step": 246 + }, + { + "epoch": 0.44028520499108736, + "grad_norm": 1.5117247104644775, + "learning_rate": 4.420218037661051e-06, + "loss": 1.0047, + "step": 247 + }, + { + "epoch": 0.44206773618538325, + "grad_norm": 1.545850157737732, + "learning_rate": 4.415262636273539e-06, + "loss": 0.9703, + "step": 248 + }, + { + "epoch": 0.44385026737967914, + "grad_norm": 1.550068974494934, + "learning_rate": 4.410307234886026e-06, + "loss": 1.0437, + "step": 249 + }, + { + "epoch": 0.44563279857397503, + "grad_norm": 1.711711049079895, + "learning_rate": 4.405351833498513e-06, + "loss": 0.9987, + "step": 250 + }, + { + "epoch": 0.44563279857397503, + "eval_loss": 1.0570459365844727, + "eval_runtime": 46.6929, + "eval_samples_per_second": 21.417, + "eval_steps_per_second": 1.349, + "step": 250 + }, + { + "epoch": 0.4474153297682709, + "grad_norm": 1.3180267810821533, + "learning_rate": 4.400396432111001e-06, + "loss": 0.9692, + "step": 251 + }, + { + "epoch": 0.44919786096256686, + "grad_norm": 1.583044409751892, + "learning_rate": 4.395441030723489e-06, + "loss": 0.9882, + "step": 252 + }, + { + "epoch": 0.45098039215686275, + "grad_norm": 1.7625699043273926, + "learning_rate": 4.3904856293359765e-06, + "loss": 1.019, + "step": 253 + }, + { + "epoch": 0.45276292335115864, + "grad_norm": 1.416609287261963, + "learning_rate": 4.385530227948464e-06, + "loss": 0.9722, + "step": 254 + }, + { + 
"epoch": 0.45454545454545453, + "grad_norm": 1.2862454652786255, + "learning_rate": 4.380574826560952e-06, + "loss": 1.0051, + "step": 255 + }, + { + "epoch": 0.4563279857397504, + "grad_norm": 1.6484653949737549, + "learning_rate": 4.37561942517344e-06, + "loss": 0.9845, + "step": 256 + }, + { + "epoch": 0.45811051693404636, + "grad_norm": 1.7906770706176758, + "learning_rate": 4.370664023785927e-06, + "loss": 0.9639, + "step": 257 + }, + { + "epoch": 0.45989304812834225, + "grad_norm": 1.6653960943222046, + "learning_rate": 4.365708622398414e-06, + "loss": 1.0079, + "step": 258 + }, + { + "epoch": 0.46167557932263814, + "grad_norm": 1.40675687789917, + "learning_rate": 4.3607532210109024e-06, + "loss": 0.9742, + "step": 259 + }, + { + "epoch": 0.46345811051693403, + "grad_norm": 1.9555974006652832, + "learning_rate": 4.35579781962339e-06, + "loss": 1.0031, + "step": 260 + }, + { + "epoch": 0.46345811051693403, + "eval_loss": 1.05105459690094, + "eval_runtime": 46.5955, + "eval_samples_per_second": 21.461, + "eval_steps_per_second": 1.352, + "step": 260 + }, + { + "epoch": 0.46524064171123, + "grad_norm": 1.5779472589492798, + "learning_rate": 4.350842418235878e-06, + "loss": 1.0131, + "step": 261 + }, + { + "epoch": 0.46702317290552586, + "grad_norm": 2.0448193550109863, + "learning_rate": 4.345887016848365e-06, + "loss": 0.9952, + "step": 262 + }, + { + "epoch": 0.46880570409982175, + "grad_norm": 1.8460253477096558, + "learning_rate": 4.340931615460853e-06, + "loss": 0.9894, + "step": 263 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 1.5205235481262207, + "learning_rate": 4.33597621407334e-06, + "loss": 0.998, + "step": 264 + }, + { + "epoch": 0.47237076648841353, + "grad_norm": 1.6225255727767944, + "learning_rate": 4.3310208126858276e-06, + "loss": 1.0076, + "step": 265 + }, + { + "epoch": 0.4741532976827095, + "grad_norm": 1.5571482181549072, + "learning_rate": 4.326065411298316e-06, + "loss": 0.9738, + "step": 266 + }, + { + "epoch": 0.47593582887700536, + "grad_norm": 1.5609713792800903, + "learning_rate": 4.321110009910804e-06, + "loss": 0.9998, + "step": 267 + }, + { + "epoch": 0.47771836007130125, + "grad_norm": 1.6771962642669678, + "learning_rate": 4.316154608523291e-06, + "loss": 0.9775, + "step": 268 + }, + { + "epoch": 0.47950089126559714, + "grad_norm": 1.8408950567245483, + "learning_rate": 4.311199207135778e-06, + "loss": 1.0069, + "step": 269 + }, + { + "epoch": 0.48128342245989303, + "grad_norm": 1.3811196088790894, + "learning_rate": 4.3062438057482654e-06, + "loss": 1.0066, + "step": 270 + }, + { + "epoch": 0.48128342245989303, + "eval_loss": 1.0506361722946167, + "eval_runtime": 46.56, + "eval_samples_per_second": 21.478, + "eval_steps_per_second": 1.353, + "step": 270 + }, + { + "epoch": 0.483065953654189, + "grad_norm": 1.4970637559890747, + "learning_rate": 4.3012884043607535e-06, + "loss": 0.9513, + "step": 271 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 1.7834646701812744, + "learning_rate": 4.296333002973242e-06, + "loss": 0.9865, + "step": 272 + }, + { + "epoch": 0.48663101604278075, + "grad_norm": 1.3917527198791504, + "learning_rate": 4.291377601585729e-06, + "loss": 0.977, + "step": 273 + }, + { + "epoch": 0.48841354723707664, + "grad_norm": 1.5269951820373535, + "learning_rate": 4.286422200198216e-06, + "loss": 0.9977, + "step": 274 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 1.4431025981903076, + "learning_rate": 4.281466798810704e-06, + "loss": 0.9944, + "step": 275 + }, + { + "epoch": 0.4919786096256685, + 
"grad_norm": 1.6358513832092285, + "learning_rate": 4.276511397423191e-06, + "loss": 0.9737, + "step": 276 + }, + { + "epoch": 0.49376114081996436, + "grad_norm": 1.3993628025054932, + "learning_rate": 4.2715559960356795e-06, + "loss": 0.9756, + "step": 277 + }, + { + "epoch": 0.49554367201426025, + "grad_norm": 1.3654887676239014, + "learning_rate": 4.266600594648167e-06, + "loss": 0.9682, + "step": 278 + }, + { + "epoch": 0.49732620320855614, + "grad_norm": 1.3362301588058472, + "learning_rate": 4.261645193260655e-06, + "loss": 0.9647, + "step": 279 + }, + { + "epoch": 0.49910873440285203, + "grad_norm": 1.2186959981918335, + "learning_rate": 4.256689791873142e-06, + "loss": 0.9926, + "step": 280 + }, + { + "epoch": 0.49910873440285203, + "eval_loss": 1.0509613752365112, + "eval_runtime": 46.6119, + "eval_samples_per_second": 21.454, + "eval_steps_per_second": 1.352, + "step": 280 + }, + { + "epoch": 0.5008912655971479, + "grad_norm": 1.639351725578308, + "learning_rate": 4.251734390485629e-06, + "loss": 0.9713, + "step": 281 + }, + { + "epoch": 0.5026737967914439, + "grad_norm": 1.4995989799499512, + "learning_rate": 4.246778989098117e-06, + "loss": 0.9563, + "step": 282 + }, + { + "epoch": 0.5044563279857398, + "grad_norm": 1.5948710441589355, + "learning_rate": 4.2418235877106055e-06, + "loss": 1.0202, + "step": 283 + }, + { + "epoch": 0.5062388591800356, + "grad_norm": 1.8484435081481934, + "learning_rate": 4.236868186323093e-06, + "loss": 0.9919, + "step": 284 + }, + { + "epoch": 0.5080213903743316, + "grad_norm": 1.5352140665054321, + "learning_rate": 4.23191278493558e-06, + "loss": 1.0022, + "step": 285 + }, + { + "epoch": 0.5098039215686274, + "grad_norm": 1.3767133951187134, + "learning_rate": 4.226957383548068e-06, + "loss": 1.0292, + "step": 286 + }, + { + "epoch": 0.5115864527629234, + "grad_norm": 1.5115472078323364, + "learning_rate": 4.222001982160555e-06, + "loss": 0.9171, + "step": 287 + }, + { + "epoch": 0.5133689839572193, + "grad_norm": 1.3484890460968018, + "learning_rate": 4.217046580773043e-06, + "loss": 1.0022, + "step": 288 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 1.391195297241211, + "learning_rate": 4.212091179385531e-06, + "loss": 0.9813, + "step": 289 + }, + { + "epoch": 0.5169340463458111, + "grad_norm": 1.5086086988449097, + "learning_rate": 4.207135777998018e-06, + "loss": 0.9763, + "step": 290 + }, + { + "epoch": 0.5169340463458111, + "eval_loss": 1.0481914281845093, + "eval_runtime": 46.5639, + "eval_samples_per_second": 21.476, + "eval_steps_per_second": 1.353, + "step": 290 + }, + { + "epoch": 0.5187165775401069, + "grad_norm": 1.4498447179794312, + "learning_rate": 4.202180376610506e-06, + "loss": 0.9716, + "step": 291 + }, + { + "epoch": 0.5204991087344029, + "grad_norm": 1.4278929233551025, + "learning_rate": 4.197224975222993e-06, + "loss": 0.9837, + "step": 292 + }, + { + "epoch": 0.5222816399286988, + "grad_norm": 1.180837869644165, + "learning_rate": 4.192269573835481e-06, + "loss": 0.9888, + "step": 293 + }, + { + "epoch": 0.5240641711229946, + "grad_norm": 1.7972043752670288, + "learning_rate": 4.1873141724479685e-06, + "loss": 0.9611, + "step": 294 + }, + { + "epoch": 0.5258467023172906, + "grad_norm": 1.7201738357543945, + "learning_rate": 4.1823587710604566e-06, + "loss": 0.9771, + "step": 295 + }, + { + "epoch": 0.5276292335115864, + "grad_norm": 1.5042906999588013, + "learning_rate": 4.177403369672944e-06, + "loss": 0.9703, + "step": 296 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 1.6776472330093384, + 
"learning_rate": 4.172447968285431e-06, + "loss": 0.9677, + "step": 297 + }, + { + "epoch": 0.5311942959001783, + "grad_norm": 1.5473730564117432, + "learning_rate": 4.167492566897919e-06, + "loss": 0.9977, + "step": 298 + }, + { + "epoch": 0.5329768270944741, + "grad_norm": 1.3396865129470825, + "learning_rate": 4.162537165510407e-06, + "loss": 0.9766, + "step": 299 + }, + { + "epoch": 0.5347593582887701, + "grad_norm": 1.6403453350067139, + "learning_rate": 4.1575817641228945e-06, + "loss": 0.9906, + "step": 300 + }, + { + "epoch": 0.5347593582887701, + "eval_loss": 1.0455559492111206, + "eval_runtime": 46.5647, + "eval_samples_per_second": 21.476, + "eval_steps_per_second": 1.353, + "step": 300 + }, + { + "epoch": 0.5365418894830659, + "grad_norm": 1.3962197303771973, + "learning_rate": 4.152626362735382e-06, + "loss": 0.9825, + "step": 301 + }, + { + "epoch": 0.5383244206773619, + "grad_norm": 1.410086750984192, + "learning_rate": 4.14767096134787e-06, + "loss": 1.0086, + "step": 302 + }, + { + "epoch": 0.5401069518716578, + "grad_norm": 1.3633315563201904, + "learning_rate": 4.142715559960357e-06, + "loss": 0.9872, + "step": 303 + }, + { + "epoch": 0.5418894830659536, + "grad_norm": 1.3306059837341309, + "learning_rate": 4.137760158572845e-06, + "loss": 0.9645, + "step": 304 + }, + { + "epoch": 0.5436720142602496, + "grad_norm": 1.4005334377288818, + "learning_rate": 4.132804757185332e-06, + "loss": 1.0071, + "step": 305 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.3115915060043335, + "learning_rate": 4.12784935579782e-06, + "loss": 0.9449, + "step": 306 + }, + { + "epoch": 0.5472370766488414, + "grad_norm": 1.3861132860183716, + "learning_rate": 4.122893954410308e-06, + "loss": 0.9653, + "step": 307 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 1.7560895681381226, + "learning_rate": 4.117938553022795e-06, + "loss": 0.9823, + "step": 308 + }, + { + "epoch": 0.5508021390374331, + "grad_norm": 1.2997792959213257, + "learning_rate": 4.112983151635283e-06, + "loss": 0.9842, + "step": 309 + }, + { + "epoch": 0.5525846702317291, + "grad_norm": 1.1829795837402344, + "learning_rate": 4.10802775024777e-06, + "loss": 0.9696, + "step": 310 + }, + { + "epoch": 0.5525846702317291, + "eval_loss": 1.0410667657852173, + "eval_runtime": 46.5919, + "eval_samples_per_second": 21.463, + "eval_steps_per_second": 1.352, + "step": 310 + }, + { + "epoch": 0.5543672014260249, + "grad_norm": 1.5539531707763672, + "learning_rate": 4.103072348860258e-06, + "loss": 0.9614, + "step": 311 + }, + { + "epoch": 0.5561497326203209, + "grad_norm": 1.2344005107879639, + "learning_rate": 4.0981169474727456e-06, + "loss": 0.9712, + "step": 312 + }, + { + "epoch": 0.5579322638146168, + "grad_norm": 1.511521816253662, + "learning_rate": 4.093161546085233e-06, + "loss": 0.9629, + "step": 313 + }, + { + "epoch": 0.5597147950089126, + "grad_norm": 1.450751781463623, + "learning_rate": 4.088206144697721e-06, + "loss": 0.9496, + "step": 314 + }, + { + "epoch": 0.5614973262032086, + "grad_norm": 1.3266924619674683, + "learning_rate": 4.083250743310209e-06, + "loss": 0.9296, + "step": 315 + }, + { + "epoch": 0.5632798573975044, + "grad_norm": 1.4744081497192383, + "learning_rate": 4.078295341922696e-06, + "loss": 0.9991, + "step": 316 + }, + { + "epoch": 0.5650623885918004, + "grad_norm": 1.5017259120941162, + "learning_rate": 4.0733399405351834e-06, + "loss": 0.9906, + "step": 317 + }, + { + "epoch": 0.5668449197860963, + "grad_norm": 1.6157349348068237, + "learning_rate": 4.0683845391476715e-06, + "loss": 
0.9707, + "step": 318 + }, + { + "epoch": 0.5686274509803921, + "grad_norm": 1.161723017692566, + "learning_rate": 4.063429137760159e-06, + "loss": 0.9634, + "step": 319 + }, + { + "epoch": 0.5704099821746881, + "grad_norm": 1.2189308404922485, + "learning_rate": 4.058473736372647e-06, + "loss": 0.9898, + "step": 320 + }, + { + "epoch": 0.5704099821746881, + "eval_loss": 1.0385627746582031, + "eval_runtime": 46.6868, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 320 + }, + { + "epoch": 0.5721925133689839, + "grad_norm": 1.570832371711731, + "learning_rate": 4.053518334985134e-06, + "loss": 0.9551, + "step": 321 + }, + { + "epoch": 0.5739750445632799, + "grad_norm": 1.483991026878357, + "learning_rate": 4.048562933597622e-06, + "loss": 0.954, + "step": 322 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 1.3836183547973633, + "learning_rate": 4.043607532210109e-06, + "loss": 0.9457, + "step": 323 + }, + { + "epoch": 0.5775401069518716, + "grad_norm": 1.4889212846755981, + "learning_rate": 4.038652130822597e-06, + "loss": 0.9664, + "step": 324 + }, + { + "epoch": 0.5793226381461676, + "grad_norm": 1.424689769744873, + "learning_rate": 4.033696729435085e-06, + "loss": 0.9386, + "step": 325 + }, + { + "epoch": 0.5811051693404634, + "grad_norm": 1.5658745765686035, + "learning_rate": 4.028741328047572e-06, + "loss": 0.9984, + "step": 326 + }, + { + "epoch": 0.5828877005347594, + "grad_norm": 1.7062257528305054, + "learning_rate": 4.02378592666006e-06, + "loss": 0.9732, + "step": 327 + }, + { + "epoch": 0.5846702317290553, + "grad_norm": 1.586201786994934, + "learning_rate": 4.018830525272547e-06, + "loss": 0.9974, + "step": 328 + }, + { + "epoch": 0.5864527629233511, + "grad_norm": 1.6218819618225098, + "learning_rate": 4.0138751238850345e-06, + "loss": 0.9468, + "step": 329 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.2196019887924194, + "learning_rate": 4.008919722497523e-06, + "loss": 0.9194, + "step": 330 + }, + { + "epoch": 0.5882352941176471, + "eval_loss": 1.0374821424484253, + "eval_runtime": 46.6459, + "eval_samples_per_second": 21.438, + "eval_steps_per_second": 1.351, + "step": 330 + }, + { + "epoch": 0.5900178253119429, + "grad_norm": 1.5755513906478882, + "learning_rate": 4.003964321110011e-06, + "loss": 0.9806, + "step": 331 + }, + { + "epoch": 0.5918003565062389, + "grad_norm": 1.482846975326538, + "learning_rate": 3.999008919722498e-06, + "loss": 0.9713, + "step": 332 + }, + { + "epoch": 0.5935828877005348, + "grad_norm": 1.4488551616668701, + "learning_rate": 3.994053518334985e-06, + "loss": 0.932, + "step": 333 + }, + { + "epoch": 0.5953654188948306, + "grad_norm": 1.8037104606628418, + "learning_rate": 3.989098116947473e-06, + "loss": 0.973, + "step": 334 + }, + { + "epoch": 0.5971479500891266, + "grad_norm": 1.507331371307373, + "learning_rate": 3.9841427155599605e-06, + "loss": 0.989, + "step": 335 + }, + { + "epoch": 0.5989304812834224, + "grad_norm": 1.49600350856781, + "learning_rate": 3.979187314172449e-06, + "loss": 0.9764, + "step": 336 + }, + { + "epoch": 0.6007130124777184, + "grad_norm": 1.3340990543365479, + "learning_rate": 3.974231912784936e-06, + "loss": 0.9359, + "step": 337 + }, + { + "epoch": 0.6024955436720143, + "grad_norm": 1.6840732097625732, + "learning_rate": 3.969276511397424e-06, + "loss": 0.949, + "step": 338 + }, + { + "epoch": 0.6042780748663101, + "grad_norm": 1.650481104850769, + "learning_rate": 3.964321110009911e-06, + "loss": 0.9643, + "step": 339 + }, + { + "epoch": 0.6060606060606061, + 
"grad_norm": 1.87238609790802, + "learning_rate": 3.959365708622398e-06, + "loss": 0.9673, + "step": 340 + }, + { + "epoch": 0.6060606060606061, + "eval_loss": 1.034188985824585, + "eval_runtime": 46.5782, + "eval_samples_per_second": 21.469, + "eval_steps_per_second": 1.353, + "step": 340 + }, + { + "epoch": 0.6078431372549019, + "grad_norm": 1.3788107633590698, + "learning_rate": 3.9544103072348865e-06, + "loss": 0.9555, + "step": 341 + }, + { + "epoch": 0.6096256684491979, + "grad_norm": 1.4971482753753662, + "learning_rate": 3.949454905847374e-06, + "loss": 1.0169, + "step": 342 + }, + { + "epoch": 0.6114081996434938, + "grad_norm": 1.7131767272949219, + "learning_rate": 3.944499504459862e-06, + "loss": 0.9959, + "step": 343 + }, + { + "epoch": 0.6131907308377896, + "grad_norm": 1.3918700218200684, + "learning_rate": 3.939544103072349e-06, + "loss": 0.9582, + "step": 344 + }, + { + "epoch": 0.6149732620320856, + "grad_norm": 1.3024098873138428, + "learning_rate": 3.934588701684836e-06, + "loss": 0.9577, + "step": 345 + }, + { + "epoch": 0.6167557932263814, + "grad_norm": 1.5201256275177002, + "learning_rate": 3.929633300297324e-06, + "loss": 0.9554, + "step": 346 + }, + { + "epoch": 0.6185383244206774, + "grad_norm": 1.3050342798233032, + "learning_rate": 3.9246778989098124e-06, + "loss": 0.94, + "step": 347 + }, + { + "epoch": 0.6203208556149733, + "grad_norm": 1.3635145425796509, + "learning_rate": 3.9197224975223e-06, + "loss": 0.9789, + "step": 348 + }, + { + "epoch": 0.6221033868092691, + "grad_norm": 1.7963106632232666, + "learning_rate": 3.914767096134787e-06, + "loss": 0.9634, + "step": 349 + }, + { + "epoch": 0.6238859180035651, + "grad_norm": 1.6082175970077515, + "learning_rate": 3.909811694747275e-06, + "loss": 0.941, + "step": 350 + }, + { + "epoch": 0.6238859180035651, + "eval_loss": 1.034374713897705, + "eval_runtime": 46.5986, + "eval_samples_per_second": 21.46, + "eval_steps_per_second": 1.352, + "step": 350 + }, + { + "epoch": 0.6256684491978609, + "grad_norm": 1.575903296470642, + "learning_rate": 3.904856293359762e-06, + "loss": 0.9449, + "step": 351 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 1.6089097261428833, + "learning_rate": 3.89990089197225e-06, + "loss": 0.9581, + "step": 352 + }, + { + "epoch": 0.6292335115864528, + "grad_norm": 1.5840903520584106, + "learning_rate": 3.8949454905847376e-06, + "loss": 0.9521, + "step": 353 + }, + { + "epoch": 0.6310160427807486, + "grad_norm": 1.9419504404067993, + "learning_rate": 3.889990089197226e-06, + "loss": 0.9552, + "step": 354 + }, + { + "epoch": 0.6327985739750446, + "grad_norm": 1.3905870914459229, + "learning_rate": 3.885034687809713e-06, + "loss": 0.9477, + "step": 355 + }, + { + "epoch": 0.6345811051693404, + "grad_norm": 1.5863651037216187, + "learning_rate": 3.8800792864222e-06, + "loss": 0.9533, + "step": 356 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.7959046363830566, + "learning_rate": 3.875123885034688e-06, + "loss": 0.9568, + "step": 357 + }, + { + "epoch": 0.6381461675579323, + "grad_norm": 1.6786869764328003, + "learning_rate": 3.870168483647176e-06, + "loss": 0.9639, + "step": 358 + }, + { + "epoch": 0.6399286987522281, + "grad_norm": 1.3908863067626953, + "learning_rate": 3.8652130822596635e-06, + "loss": 0.9601, + "step": 359 + }, + { + "epoch": 0.6417112299465241, + "grad_norm": 1.9571239948272705, + "learning_rate": 3.860257680872151e-06, + "loss": 0.9578, + "step": 360 + }, + { + "epoch": 0.6417112299465241, + "eval_loss": 1.0281429290771484, + "eval_runtime": 46.6801, + 
"eval_samples_per_second": 21.422, + "eval_steps_per_second": 1.35, + "step": 360 + }, + { + "epoch": 0.64349376114082, + "grad_norm": 1.445169448852539, + "learning_rate": 3.855302279484638e-06, + "loss": 0.9706, + "step": 361 + }, + { + "epoch": 0.6452762923351159, + "grad_norm": 1.475993275642395, + "learning_rate": 3.850346878097126e-06, + "loss": 0.9396, + "step": 362 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 1.7329176664352417, + "learning_rate": 3.845391476709614e-06, + "loss": 0.965, + "step": 363 + }, + { + "epoch": 0.6488413547237076, + "grad_norm": 1.6262738704681396, + "learning_rate": 3.8404360753221014e-06, + "loss": 0.9343, + "step": 364 + }, + { + "epoch": 0.6506238859180036, + "grad_norm": 1.3373092412948608, + "learning_rate": 3.835480673934589e-06, + "loss": 0.9652, + "step": 365 + }, + { + "epoch": 0.6524064171122995, + "grad_norm": 1.3127856254577637, + "learning_rate": 3.830525272547077e-06, + "loss": 0.935, + "step": 366 + }, + { + "epoch": 0.6541889483065954, + "grad_norm": 1.8555575609207153, + "learning_rate": 3.825569871159564e-06, + "loss": 0.9491, + "step": 367 + }, + { + "epoch": 0.6559714795008913, + "grad_norm": 1.5025370121002197, + "learning_rate": 3.820614469772052e-06, + "loss": 0.9626, + "step": 368 + }, + { + "epoch": 0.6577540106951871, + "grad_norm": 1.4000506401062012, + "learning_rate": 3.815659068384539e-06, + "loss": 0.9603, + "step": 369 + }, + { + "epoch": 0.6595365418894831, + "grad_norm": 1.4358906745910645, + "learning_rate": 3.810703666997027e-06, + "loss": 0.956, + "step": 370 + }, + { + "epoch": 0.6595365418894831, + "eval_loss": 1.02617347240448, + "eval_runtime": 46.651, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 370 + }, + { + "epoch": 0.661319073083779, + "grad_norm": 1.8912854194641113, + "learning_rate": 3.8057482656095146e-06, + "loss": 0.9657, + "step": 371 + }, + { + "epoch": 0.6631016042780749, + "grad_norm": 1.5366394519805908, + "learning_rate": 3.8007928642220023e-06, + "loss": 0.9673, + "step": 372 + }, + { + "epoch": 0.6648841354723708, + "grad_norm": 1.7613537311553955, + "learning_rate": 3.79583746283449e-06, + "loss": 0.9436, + "step": 373 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.1846625804901123, + "learning_rate": 3.7908820614469776e-06, + "loss": 0.9182, + "step": 374 + }, + { + "epoch": 0.6684491978609626, + "grad_norm": 1.6215946674346924, + "learning_rate": 3.7859266600594653e-06, + "loss": 0.9589, + "step": 375 + }, + { + "epoch": 0.6702317290552585, + "grad_norm": 1.6887030601501465, + "learning_rate": 3.7809712586719525e-06, + "loss": 0.9756, + "step": 376 + }, + { + "epoch": 0.6720142602495544, + "grad_norm": 1.6964572668075562, + "learning_rate": 3.77601585728444e-06, + "loss": 0.961, + "step": 377 + }, + { + "epoch": 0.6737967914438503, + "grad_norm": 1.435586929321289, + "learning_rate": 3.771060455896928e-06, + "loss": 0.9267, + "step": 378 + }, + { + "epoch": 0.6755793226381461, + "grad_norm": 1.7841671705245972, + "learning_rate": 3.766105054509416e-06, + "loss": 0.9518, + "step": 379 + }, + { + "epoch": 0.6773618538324421, + "grad_norm": 1.2004536390304565, + "learning_rate": 3.761149653121903e-06, + "loss": 0.9519, + "step": 380 + }, + { + "epoch": 0.6773618538324421, + "eval_loss": 1.0265334844589233, + "eval_runtime": 46.6028, + "eval_samples_per_second": 21.458, + "eval_steps_per_second": 1.352, + "step": 380 + }, + { + "epoch": 0.679144385026738, + "grad_norm": 1.3047566413879395, + "learning_rate": 3.756194251734391e-06, + "loss": 
0.9484, + "step": 381 + }, + { + "epoch": 0.6809269162210339, + "grad_norm": 1.5224716663360596, + "learning_rate": 3.7512388503468785e-06, + "loss": 0.9477, + "step": 382 + }, + { + "epoch": 0.6827094474153298, + "grad_norm": 1.3004342317581177, + "learning_rate": 3.7462834489593657e-06, + "loss": 0.9604, + "step": 383 + }, + { + "epoch": 0.6844919786096256, + "grad_norm": 1.3921082019805908, + "learning_rate": 3.741328047571854e-06, + "loss": 0.8758, + "step": 384 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 1.5825459957122803, + "learning_rate": 3.7363726461843415e-06, + "loss": 0.931, + "step": 385 + }, + { + "epoch": 0.6880570409982175, + "grad_norm": 1.5469566583633423, + "learning_rate": 3.7314172447968287e-06, + "loss": 0.9627, + "step": 386 + }, + { + "epoch": 0.6898395721925134, + "grad_norm": 1.2618225812911987, + "learning_rate": 3.7264618434093164e-06, + "loss": 0.9368, + "step": 387 + }, + { + "epoch": 0.6916221033868093, + "grad_norm": 1.4424333572387695, + "learning_rate": 3.721506442021804e-06, + "loss": 0.9692, + "step": 388 + }, + { + "epoch": 0.6934046345811051, + "grad_norm": 1.6238356828689575, + "learning_rate": 3.716551040634292e-06, + "loss": 0.9367, + "step": 389 + }, + { + "epoch": 0.6951871657754011, + "grad_norm": 1.4406427145004272, + "learning_rate": 3.7115956392467794e-06, + "loss": 0.954, + "step": 390 + }, + { + "epoch": 0.6951871657754011, + "eval_loss": 1.0200062990188599, + "eval_runtime": 46.7033, + "eval_samples_per_second": 21.412, + "eval_steps_per_second": 1.349, + "step": 390 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 1.437024712562561, + "learning_rate": 3.706640237859267e-06, + "loss": 0.9506, + "step": 391 + }, + { + "epoch": 0.6987522281639929, + "grad_norm": 1.4848400354385376, + "learning_rate": 3.7016848364717543e-06, + "loss": 0.9205, + "step": 392 + }, + { + "epoch": 0.7005347593582888, + "grad_norm": 1.484315276145935, + "learning_rate": 3.696729435084242e-06, + "loss": 0.9379, + "step": 393 + }, + { + "epoch": 0.7023172905525846, + "grad_norm": 1.4815152883529663, + "learning_rate": 3.6917740336967296e-06, + "loss": 0.9742, + "step": 394 + }, + { + "epoch": 0.7040998217468806, + "grad_norm": 1.586028814315796, + "learning_rate": 3.6868186323092177e-06, + "loss": 0.9397, + "step": 395 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 1.713163137435913, + "learning_rate": 3.681863230921705e-06, + "loss": 0.9133, + "step": 396 + }, + { + "epoch": 0.7076648841354723, + "grad_norm": 1.3876514434814453, + "learning_rate": 3.6769078295341926e-06, + "loss": 0.9483, + "step": 397 + }, + { + "epoch": 0.7094474153297683, + "grad_norm": 1.3811695575714111, + "learning_rate": 3.6719524281466802e-06, + "loss": 0.9432, + "step": 398 + }, + { + "epoch": 0.7112299465240641, + "grad_norm": 1.4277275800704956, + "learning_rate": 3.6669970267591675e-06, + "loss": 0.9606, + "step": 399 + }, + { + "epoch": 0.7130124777183601, + "grad_norm": 1.2409071922302246, + "learning_rate": 3.6620416253716556e-06, + "loss": 0.9754, + "step": 400 + }, + { + "epoch": 0.7130124777183601, + "eval_loss": 1.0203064680099487, + "eval_runtime": 46.6978, + "eval_samples_per_second": 21.414, + "eval_steps_per_second": 1.349, + "step": 400 + }, + { + "epoch": 0.714795008912656, + "grad_norm": 1.7478358745574951, + "learning_rate": 3.6570862239841432e-06, + "loss": 0.9343, + "step": 401 + }, + { + "epoch": 0.7165775401069518, + "grad_norm": 1.3118643760681152, + "learning_rate": 3.6521308225966305e-06, + "loss": 0.9459, + "step": 402 + }, + { + "epoch": 
0.7183600713012478, + "grad_norm": 1.6754299402236938, + "learning_rate": 3.647175421209118e-06, + "loss": 0.9477, + "step": 403 + }, + { + "epoch": 0.7201426024955436, + "grad_norm": 1.4706963300704956, + "learning_rate": 3.6422200198216058e-06, + "loss": 0.9237, + "step": 404 + }, + { + "epoch": 0.7219251336898396, + "grad_norm": 1.7521008253097534, + "learning_rate": 3.637264618434094e-06, + "loss": 0.9473, + "step": 405 + }, + { + "epoch": 0.7237076648841355, + "grad_norm": 1.5755548477172852, + "learning_rate": 3.632309217046581e-06, + "loss": 0.9632, + "step": 406 + }, + { + "epoch": 0.7254901960784313, + "grad_norm": 1.6598784923553467, + "learning_rate": 3.6273538156590688e-06, + "loss": 0.96, + "step": 407 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.4760100841522217, + "learning_rate": 3.6223984142715564e-06, + "loss": 0.9562, + "step": 408 + }, + { + "epoch": 0.7290552584670231, + "grad_norm": 1.5755516290664673, + "learning_rate": 3.6174430128840437e-06, + "loss": 0.9244, + "step": 409 + }, + { + "epoch": 0.7308377896613191, + "grad_norm": 1.3510775566101074, + "learning_rate": 3.6124876114965313e-06, + "loss": 0.9179, + "step": 410 + }, + { + "epoch": 0.7308377896613191, + "eval_loss": 1.0200093984603882, + "eval_runtime": 46.669, + "eval_samples_per_second": 21.427, + "eval_steps_per_second": 1.35, + "step": 410 + }, + { + "epoch": 0.732620320855615, + "grad_norm": 1.5007761716842651, + "learning_rate": 3.6075322101090194e-06, + "loss": 0.9746, + "step": 411 + }, + { + "epoch": 0.7344028520499108, + "grad_norm": 1.417822241783142, + "learning_rate": 3.6025768087215067e-06, + "loss": 0.9199, + "step": 412 + }, + { + "epoch": 0.7361853832442068, + "grad_norm": 1.534996509552002, + "learning_rate": 3.5976214073339943e-06, + "loss": 0.9603, + "step": 413 + }, + { + "epoch": 0.7379679144385026, + "grad_norm": 1.7224639654159546, + "learning_rate": 3.592666005946482e-06, + "loss": 0.9735, + "step": 414 + }, + { + "epoch": 0.7397504456327986, + "grad_norm": 1.594838261604309, + "learning_rate": 3.5877106045589692e-06, + "loss": 0.9362, + "step": 415 + }, + { + "epoch": 0.7415329768270945, + "grad_norm": 1.3750888109207153, + "learning_rate": 3.5827552031714573e-06, + "loss": 0.9365, + "step": 416 + }, + { + "epoch": 0.7433155080213903, + "grad_norm": 1.5538432598114014, + "learning_rate": 3.577799801783945e-06, + "loss": 0.9562, + "step": 417 + }, + { + "epoch": 0.7450980392156863, + "grad_norm": 1.4937995672225952, + "learning_rate": 3.5728444003964326e-06, + "loss": 0.9139, + "step": 418 + }, + { + "epoch": 0.7468805704099821, + "grad_norm": 1.2770204544067383, + "learning_rate": 3.56788899900892e-06, + "loss": 0.9246, + "step": 419 + }, + { + "epoch": 0.7486631016042781, + "grad_norm": 1.5773707628250122, + "learning_rate": 3.5629335976214075e-06, + "loss": 0.9558, + "step": 420 + }, + { + "epoch": 0.7486631016042781, + "eval_loss": 1.018223524093628, + "eval_runtime": 46.7216, + "eval_samples_per_second": 21.403, + "eval_steps_per_second": 1.348, + "step": 420 + }, + { + "epoch": 0.750445632798574, + "grad_norm": 1.7162814140319824, + "learning_rate": 3.5579781962338956e-06, + "loss": 0.9187, + "step": 421 + }, + { + "epoch": 0.7522281639928698, + "grad_norm": 1.4196544885635376, + "learning_rate": 3.553022794846383e-06, + "loss": 0.9518, + "step": 422 + }, + { + "epoch": 0.7540106951871658, + "grad_norm": 1.3683091402053833, + "learning_rate": 3.5480673934588705e-06, + "loss": 0.9248, + "step": 423 + }, + { + "epoch": 0.7557932263814616, + "grad_norm": 
1.3377984762191772, + "learning_rate": 3.543111992071358e-06, + "loss": 0.9598, + "step": 424 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 1.5043303966522217, + "learning_rate": 3.5381565906838454e-06, + "loss": 0.9175, + "step": 425 + }, + { + "epoch": 0.7593582887700535, + "grad_norm": 1.3370412588119507, + "learning_rate": 3.533201189296333e-06, + "loss": 0.9645, + "step": 426 + }, + { + "epoch": 0.7611408199643493, + "grad_norm": 1.914324402809143, + "learning_rate": 3.528245787908821e-06, + "loss": 0.9281, + "step": 427 + }, + { + "epoch": 0.7629233511586453, + "grad_norm": 1.2636290788650513, + "learning_rate": 3.523290386521309e-06, + "loss": 0.9162, + "step": 428 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 1.3550281524658203, + "learning_rate": 3.518334985133796e-06, + "loss": 0.948, + "step": 429 + }, + { + "epoch": 0.7664884135472371, + "grad_norm": 1.3391727209091187, + "learning_rate": 3.5133795837462837e-06, + "loss": 0.9303, + "step": 430 + }, + { + "epoch": 0.7664884135472371, + "eval_loss": 1.0136384963989258, + "eval_runtime": 46.6342, + "eval_samples_per_second": 21.443, + "eval_steps_per_second": 1.351, + "step": 430 + }, + { + "epoch": 0.768270944741533, + "grad_norm": 1.5297709703445435, + "learning_rate": 3.508424182358771e-06, + "loss": 0.98, + "step": 431 + }, + { + "epoch": 0.7700534759358288, + "grad_norm": 1.2201571464538574, + "learning_rate": 3.503468780971259e-06, + "loss": 0.9274, + "step": 432 + }, + { + "epoch": 0.7718360071301248, + "grad_norm": 1.2383842468261719, + "learning_rate": 3.4985133795837467e-06, + "loss": 0.9424, + "step": 433 + }, + { + "epoch": 0.7736185383244206, + "grad_norm": 1.5169589519500732, + "learning_rate": 3.4935579781962344e-06, + "loss": 0.9635, + "step": 434 + }, + { + "epoch": 0.7754010695187166, + "grad_norm": 1.2745269536972046, + "learning_rate": 3.4886025768087216e-06, + "loss": 0.9307, + "step": 435 + }, + { + "epoch": 0.7771836007130125, + "grad_norm": 1.3778202533721924, + "learning_rate": 3.4836471754212093e-06, + "loss": 0.9197, + "step": 436 + }, + { + "epoch": 0.7789661319073083, + "grad_norm": 1.466562271118164, + "learning_rate": 3.4786917740336974e-06, + "loss": 0.9332, + "step": 437 + }, + { + "epoch": 0.7807486631016043, + "grad_norm": 1.4385347366333008, + "learning_rate": 3.4737363726461846e-06, + "loss": 0.9458, + "step": 438 + }, + { + "epoch": 0.7825311942959001, + "grad_norm": 1.6402126550674438, + "learning_rate": 3.4687809712586723e-06, + "loss": 0.9622, + "step": 439 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 1.5755741596221924, + "learning_rate": 3.46382556987116e-06, + "loss": 0.9608, + "step": 440 + }, + { + "epoch": 0.7843137254901961, + "eval_loss": 1.010831356048584, + "eval_runtime": 46.6146, + "eval_samples_per_second": 21.452, + "eval_steps_per_second": 1.352, + "step": 440 + }, + { + "epoch": 0.786096256684492, + "grad_norm": 1.723796010017395, + "learning_rate": 3.458870168483647e-06, + "loss": 0.945, + "step": 441 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 1.3989365100860596, + "learning_rate": 3.453914767096135e-06, + "loss": 0.931, + "step": 442 + }, + { + "epoch": 0.7896613190730838, + "grad_norm": 1.5464838743209839, + "learning_rate": 3.448959365708623e-06, + "loss": 0.9168, + "step": 443 + }, + { + "epoch": 0.7914438502673797, + "grad_norm": 1.7541731595993042, + "learning_rate": 3.4440039643211106e-06, + "loss": 0.9397, + "step": 444 + }, + { + "epoch": 0.7932263814616756, + "grad_norm": 1.467142939567566, + "learning_rate": 
3.439048562933598e-06, + "loss": 0.9484, + "step": 445 + }, + { + "epoch": 0.7950089126559715, + "grad_norm": 1.7130905389785767, + "learning_rate": 3.4340931615460855e-06, + "loss": 0.9211, + "step": 446 + }, + { + "epoch": 0.7967914438502673, + "grad_norm": 1.4574313163757324, + "learning_rate": 3.4291377601585727e-06, + "loss": 0.9027, + "step": 447 + }, + { + "epoch": 0.7985739750445633, + "grad_norm": 1.7143244743347168, + "learning_rate": 3.424182358771061e-06, + "loss": 0.9465, + "step": 448 + }, + { + "epoch": 0.8003565062388592, + "grad_norm": 1.5416609048843384, + "learning_rate": 3.4192269573835485e-06, + "loss": 0.9332, + "step": 449 + }, + { + "epoch": 0.8021390374331551, + "grad_norm": 1.456154704093933, + "learning_rate": 3.414271555996036e-06, + "loss": 0.9729, + "step": 450 + }, + { + "epoch": 0.8021390374331551, + "eval_loss": 1.0104098320007324, + "eval_runtime": 46.6938, + "eval_samples_per_second": 21.416, + "eval_steps_per_second": 1.349, + "step": 450 + }, + { + "epoch": 0.803921568627451, + "grad_norm": 1.4171167612075806, + "learning_rate": 3.4093161546085234e-06, + "loss": 0.9444, + "step": 451 + }, + { + "epoch": 0.8057040998217468, + "grad_norm": 1.294493317604065, + "learning_rate": 3.404360753221011e-06, + "loss": 0.9351, + "step": 452 + }, + { + "epoch": 0.8074866310160428, + "grad_norm": 1.312842607498169, + "learning_rate": 3.399405351833499e-06, + "loss": 0.9246, + "step": 453 + }, + { + "epoch": 0.8092691622103387, + "grad_norm": 1.524051308631897, + "learning_rate": 3.3944499504459868e-06, + "loss": 0.9531, + "step": 454 + }, + { + "epoch": 0.8110516934046346, + "grad_norm": 1.317704439163208, + "learning_rate": 3.389494549058474e-06, + "loss": 0.9273, + "step": 455 + }, + { + "epoch": 0.8128342245989305, + "grad_norm": 1.7461873292922974, + "learning_rate": 3.3845391476709617e-06, + "loss": 0.9193, + "step": 456 + }, + { + "epoch": 0.8146167557932263, + "grad_norm": 1.4218385219573975, + "learning_rate": 3.379583746283449e-06, + "loss": 0.9355, + "step": 457 + }, + { + "epoch": 0.8163992869875223, + "grad_norm": 1.3833569288253784, + "learning_rate": 3.3746283448959366e-06, + "loss": 0.9197, + "step": 458 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.394129991531372, + "learning_rate": 3.3696729435084247e-06, + "loss": 0.9521, + "step": 459 + }, + { + "epoch": 0.8199643493761141, + "grad_norm": 1.4939794540405273, + "learning_rate": 3.3647175421209123e-06, + "loss": 0.923, + "step": 460 + }, + { + "epoch": 0.8199643493761141, + "eval_loss": 1.0103003978729248, + "eval_runtime": 46.6448, + "eval_samples_per_second": 21.439, + "eval_steps_per_second": 1.351, + "step": 460 + }, + { + "epoch": 0.82174688057041, + "grad_norm": 1.37795090675354, + "learning_rate": 3.3597621407333996e-06, + "loss": 0.9514, + "step": 461 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 1.3167624473571777, + "learning_rate": 3.3548067393458872e-06, + "loss": 0.9349, + "step": 462 + }, + { + "epoch": 0.8253119429590018, + "grad_norm": 1.4826208353042603, + "learning_rate": 3.349851337958375e-06, + "loss": 0.9225, + "step": 463 + }, + { + "epoch": 0.8270944741532977, + "grad_norm": 1.6088447570800781, + "learning_rate": 3.344895936570863e-06, + "loss": 0.9237, + "step": 464 + }, + { + "epoch": 0.8288770053475936, + "grad_norm": 1.490053653717041, + "learning_rate": 3.33994053518335e-06, + "loss": 0.9679, + "step": 465 + }, + { + "epoch": 0.8306595365418895, + "grad_norm": 1.485168695449829, + "learning_rate": 3.334985133795838e-06, + "loss": 0.9425, + "step": 466 
+ }, + { + "epoch": 0.8324420677361853, + "grad_norm": 1.6942230463027954, + "learning_rate": 3.330029732408325e-06, + "loss": 0.9106, + "step": 467 + }, + { + "epoch": 0.8342245989304813, + "grad_norm": 1.4257986545562744, + "learning_rate": 3.3250743310208128e-06, + "loss": 0.9405, + "step": 468 + }, + { + "epoch": 0.8360071301247772, + "grad_norm": 1.4217643737792969, + "learning_rate": 3.3201189296333004e-06, + "loss": 0.9197, + "step": 469 + }, + { + "epoch": 0.8377896613190731, + "grad_norm": 1.3932244777679443, + "learning_rate": 3.3151635282457885e-06, + "loss": 0.9177, + "step": 470 + }, + { + "epoch": 0.8377896613190731, + "eval_loss": 1.0090515613555908, + "eval_runtime": 46.8153, + "eval_samples_per_second": 21.361, + "eval_steps_per_second": 1.346, + "step": 470 + }, + { + "epoch": 0.839572192513369, + "grad_norm": 1.2177809476852417, + "learning_rate": 3.3102081268582757e-06, + "loss": 0.92, + "step": 471 + }, + { + "epoch": 0.8413547237076648, + "grad_norm": 1.2709245681762695, + "learning_rate": 3.3052527254707634e-06, + "loss": 0.9185, + "step": 472 + }, + { + "epoch": 0.8431372549019608, + "grad_norm": 1.4141621589660645, + "learning_rate": 3.300297324083251e-06, + "loss": 0.9332, + "step": 473 + }, + { + "epoch": 0.8449197860962567, + "grad_norm": 1.4409421682357788, + "learning_rate": 3.2953419226957383e-06, + "loss": 0.9604, + "step": 474 + }, + { + "epoch": 0.8467023172905526, + "grad_norm": 1.2500907182693481, + "learning_rate": 3.2903865213082264e-06, + "loss": 0.9369, + "step": 475 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 1.3185211420059204, + "learning_rate": 3.285431119920714e-06, + "loss": 0.9409, + "step": 476 + }, + { + "epoch": 0.8502673796791443, + "grad_norm": 1.2824376821517944, + "learning_rate": 3.2804757185332013e-06, + "loss": 0.9829, + "step": 477 + }, + { + "epoch": 0.8520499108734403, + "grad_norm": 1.4796322584152222, + "learning_rate": 3.275520317145689e-06, + "loss": 0.9282, + "step": 478 + }, + { + "epoch": 0.8538324420677362, + "grad_norm": 1.5836542844772339, + "learning_rate": 3.2705649157581766e-06, + "loss": 0.9382, + "step": 479 + }, + { + "epoch": 0.8556149732620321, + "grad_norm": 1.553688645362854, + "learning_rate": 3.2656095143706647e-06, + "loss": 0.9423, + "step": 480 + }, + { + "epoch": 0.8556149732620321, + "eval_loss": 1.0048539638519287, + "eval_runtime": 46.7514, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 480 + }, + { + "epoch": 0.857397504456328, + "grad_norm": 1.568942666053772, + "learning_rate": 3.260654112983152e-06, + "loss": 0.9206, + "step": 481 + }, + { + "epoch": 0.8591800356506238, + "grad_norm": 1.928328037261963, + "learning_rate": 3.2556987115956396e-06, + "loss": 0.9357, + "step": 482 + }, + { + "epoch": 0.8609625668449198, + "grad_norm": 1.5885226726531982, + "learning_rate": 3.2507433102081273e-06, + "loss": 0.9142, + "step": 483 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 1.5181794166564941, + "learning_rate": 3.2457879088206145e-06, + "loss": 0.8949, + "step": 484 + }, + { + "epoch": 0.8645276292335116, + "grad_norm": 1.2631601095199585, + "learning_rate": 3.240832507433102e-06, + "loss": 0.9369, + "step": 485 + }, + { + "epoch": 0.8663101604278075, + "grad_norm": 1.3040809631347656, + "learning_rate": 3.2358771060455903e-06, + "loss": 0.9243, + "step": 486 + }, + { + "epoch": 0.8680926916221033, + "grad_norm": 1.8351012468338013, + "learning_rate": 3.2309217046580775e-06, + "loss": 0.9463, + "step": 487 + }, + { + "epoch": 0.8698752228163993, + 
"grad_norm": 1.5455973148345947, + "learning_rate": 3.225966303270565e-06, + "loss": 0.9203, + "step": 488 + }, + { + "epoch": 0.8716577540106952, + "grad_norm": 1.3624253273010254, + "learning_rate": 3.221010901883053e-06, + "loss": 0.8842, + "step": 489 + }, + { + "epoch": 0.8734402852049911, + "grad_norm": 1.2611980438232422, + "learning_rate": 3.21605550049554e-06, + "loss": 0.9197, + "step": 490 + }, + { + "epoch": 0.8734402852049911, + "eval_loss": 1.0056157112121582, + "eval_runtime": 46.6828, + "eval_samples_per_second": 21.421, + "eval_steps_per_second": 1.35, + "step": 490 + }, + { + "epoch": 0.875222816399287, + "grad_norm": 1.5297733545303345, + "learning_rate": 3.211100099108028e-06, + "loss": 0.9202, + "step": 491 + }, + { + "epoch": 0.8770053475935828, + "grad_norm": 1.541585922241211, + "learning_rate": 3.206144697720516e-06, + "loss": 0.9288, + "step": 492 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 1.3235573768615723, + "learning_rate": 3.201189296333003e-06, + "loss": 0.9242, + "step": 493 + }, + { + "epoch": 0.8805704099821747, + "grad_norm": 1.4747593402862549, + "learning_rate": 3.1962338949454907e-06, + "loss": 0.918, + "step": 494 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.530286192893982, + "learning_rate": 3.1912784935579784e-06, + "loss": 0.9223, + "step": 495 + }, + { + "epoch": 0.8841354723707665, + "grad_norm": 1.5007365942001343, + "learning_rate": 3.1863230921704664e-06, + "loss": 0.9273, + "step": 496 + }, + { + "epoch": 0.8859180035650623, + "grad_norm": 1.6092145442962646, + "learning_rate": 3.1813676907829537e-06, + "loss": 0.9085, + "step": 497 + }, + { + "epoch": 0.8877005347593583, + "grad_norm": 1.4303230047225952, + "learning_rate": 3.1764122893954413e-06, + "loss": 0.9255, + "step": 498 + }, + { + "epoch": 0.8894830659536542, + "grad_norm": 1.3100906610488892, + "learning_rate": 3.171456888007929e-06, + "loss": 0.8832, + "step": 499 + }, + { + "epoch": 0.8912655971479501, + "grad_norm": 1.608756184577942, + "learning_rate": 3.1665014866204162e-06, + "loss": 0.9347, + "step": 500 + }, + { + "epoch": 0.8912655971479501, + "eval_loss": 1.004418969154358, + "eval_runtime": 46.7969, + "eval_samples_per_second": 21.369, + "eval_steps_per_second": 1.346, + "step": 500 + }, + { + "epoch": 0.893048128342246, + "grad_norm": 1.752517819404602, + "learning_rate": 3.161546085232904e-06, + "loss": 0.9387, + "step": 501 + }, + { + "epoch": 0.8948306595365418, + "grad_norm": 1.4832419157028198, + "learning_rate": 3.156590683845392e-06, + "loss": 0.953, + "step": 502 + }, + { + "epoch": 0.8966131907308378, + "grad_norm": 1.3396515846252441, + "learning_rate": 3.1516352824578792e-06, + "loss": 0.9047, + "step": 503 + }, + { + "epoch": 0.8983957219251337, + "grad_norm": 1.3921819925308228, + "learning_rate": 3.146679881070367e-06, + "loss": 0.9185, + "step": 504 + }, + { + "epoch": 0.9001782531194296, + "grad_norm": 1.469477891921997, + "learning_rate": 3.1417244796828546e-06, + "loss": 0.939, + "step": 505 + }, + { + "epoch": 0.9019607843137255, + "grad_norm": 1.3249036073684692, + "learning_rate": 3.136769078295342e-06, + "loss": 0.9455, + "step": 506 + }, + { + "epoch": 0.9037433155080213, + "grad_norm": 1.5113000869750977, + "learning_rate": 3.13181367690783e-06, + "loss": 0.9438, + "step": 507 + }, + { + "epoch": 0.9055258467023173, + "grad_norm": 1.2723312377929688, + "learning_rate": 3.1268582755203175e-06, + "loss": 0.882, + "step": 508 + }, + { + "epoch": 0.9073083778966132, + "grad_norm": 1.5971113443374634, + "learning_rate": 
3.121902874132805e-06, + "loss": 0.9296, + "step": 509 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.5077036619186401, + "learning_rate": 3.1169474727452924e-06, + "loss": 0.936, + "step": 510 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 1.0027379989624023, + "eval_runtime": 46.6635, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 510 + }, + { + "epoch": 0.910873440285205, + "grad_norm": 1.405448317527771, + "learning_rate": 3.11199207135778e-06, + "loss": 0.9213, + "step": 511 + }, + { + "epoch": 0.9126559714795008, + "grad_norm": 1.4942853450775146, + "learning_rate": 3.107036669970268e-06, + "loss": 0.9436, + "step": 512 + }, + { + "epoch": 0.9144385026737968, + "grad_norm": 1.7423145771026611, + "learning_rate": 3.1020812685827554e-06, + "loss": 0.9292, + "step": 513 + }, + { + "epoch": 0.9162210338680927, + "grad_norm": 1.383570671081543, + "learning_rate": 3.097125867195243e-06, + "loss": 0.9229, + "step": 514 + }, + { + "epoch": 0.9180035650623886, + "grad_norm": 1.337597131729126, + "learning_rate": 3.0921704658077308e-06, + "loss": 0.9312, + "step": 515 + }, + { + "epoch": 0.9197860962566845, + "grad_norm": 1.3440173864364624, + "learning_rate": 3.087215064420218e-06, + "loss": 0.9327, + "step": 516 + }, + { + "epoch": 0.9215686274509803, + "grad_norm": 1.2668343782424927, + "learning_rate": 3.0822596630327057e-06, + "loss": 0.8952, + "step": 517 + }, + { + "epoch": 0.9233511586452763, + "grad_norm": 1.346970796585083, + "learning_rate": 3.0773042616451937e-06, + "loss": 0.8791, + "step": 518 + }, + { + "epoch": 0.9251336898395722, + "grad_norm": 1.4276890754699707, + "learning_rate": 3.0723488602576814e-06, + "loss": 0.9303, + "step": 519 + }, + { + "epoch": 0.9269162210338681, + "grad_norm": 1.7119300365447998, + "learning_rate": 3.0673934588701686e-06, + "loss": 0.9023, + "step": 520 + }, + { + "epoch": 0.9269162210338681, + "eval_loss": 1.0006085634231567, + "eval_runtime": 46.6636, + "eval_samples_per_second": 21.43, + "eval_steps_per_second": 1.35, + "step": 520 + }, + { + "epoch": 0.928698752228164, + "grad_norm": 1.5053153038024902, + "learning_rate": 3.0624380574826563e-06, + "loss": 0.9321, + "step": 521 + }, + { + "epoch": 0.93048128342246, + "grad_norm": 1.4875434637069702, + "learning_rate": 3.0574826560951435e-06, + "loss": 0.9112, + "step": 522 + }, + { + "epoch": 0.9322638146167558, + "grad_norm": 1.5411895513534546, + "learning_rate": 3.0525272547076316e-06, + "loss": 0.9279, + "step": 523 + }, + { + "epoch": 0.9340463458110517, + "grad_norm": 1.7409446239471436, + "learning_rate": 3.0475718533201193e-06, + "loss": 0.8981, + "step": 524 + }, + { + "epoch": 0.9358288770053476, + "grad_norm": 1.537593126296997, + "learning_rate": 3.042616451932607e-06, + "loss": 0.9232, + "step": 525 + }, + { + "epoch": 0.9376114081996435, + "grad_norm": 1.4763727188110352, + "learning_rate": 3.037661050545094e-06, + "loss": 0.9251, + "step": 526 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 1.4310897588729858, + "learning_rate": 3.032705649157582e-06, + "loss": 0.9222, + "step": 527 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.3526382446289062, + "learning_rate": 3.02775024777007e-06, + "loss": 0.9037, + "step": 528 + }, + { + "epoch": 0.9429590017825312, + "grad_norm": 1.479190707206726, + "learning_rate": 3.022794846382557e-06, + "loss": 0.9236, + "step": 529 + }, + { + "epoch": 0.9447415329768271, + "grad_norm": 1.3150599002838135, + "learning_rate": 3.017839444995045e-06, + "loss": 0.915, + "step": 530 + 
}, + { + "epoch": 0.9447415329768271, + "eval_loss": 1.0004721879959106, + "eval_runtime": 46.7447, + "eval_samples_per_second": 21.393, + "eval_steps_per_second": 1.348, + "step": 530 + }, + { + "epoch": 0.946524064171123, + "grad_norm": 1.5540575981140137, + "learning_rate": 3.0128840436075325e-06, + "loss": 0.9368, + "step": 531 + }, + { + "epoch": 0.948306595365419, + "grad_norm": 1.236002802848816, + "learning_rate": 3.0079286422200197e-06, + "loss": 0.9127, + "step": 532 + }, + { + "epoch": 0.9500891265597148, + "grad_norm": 1.1216799020767212, + "learning_rate": 3.0029732408325074e-06, + "loss": 0.9257, + "step": 533 + }, + { + "epoch": 0.9518716577540107, + "grad_norm": 1.2347314357757568, + "learning_rate": 2.9980178394449955e-06, + "loss": 0.9031, + "step": 534 + }, + { + "epoch": 0.9536541889483066, + "grad_norm": 1.4934622049331665, + "learning_rate": 2.993062438057483e-06, + "loss": 0.9392, + "step": 535 + }, + { + "epoch": 0.9554367201426025, + "grad_norm": 1.4701998233795166, + "learning_rate": 2.9881070366699704e-06, + "loss": 0.9553, + "step": 536 + }, + { + "epoch": 0.9572192513368984, + "grad_norm": 1.3633320331573486, + "learning_rate": 2.983151635282458e-06, + "loss": 0.9187, + "step": 537 + }, + { + "epoch": 0.9590017825311943, + "grad_norm": 1.3644108772277832, + "learning_rate": 2.9781962338949457e-06, + "loss": 0.8887, + "step": 538 + }, + { + "epoch": 0.9607843137254902, + "grad_norm": 1.3315613269805908, + "learning_rate": 2.9732408325074334e-06, + "loss": 0.9067, + "step": 539 + }, + { + "epoch": 0.9625668449197861, + "grad_norm": 1.1904367208480835, + "learning_rate": 2.968285431119921e-06, + "loss": 0.8795, + "step": 540 + }, + { + "epoch": 0.9625668449197861, + "eval_loss": 0.9979129433631897, + "eval_runtime": 46.8388, + "eval_samples_per_second": 21.35, + "eval_steps_per_second": 1.345, + "step": 540 + }, + { + "epoch": 0.964349376114082, + "grad_norm": 1.2745060920715332, + "learning_rate": 2.9633300297324087e-06, + "loss": 0.9501, + "step": 541 + }, + { + "epoch": 0.966131907308378, + "grad_norm": 1.2493999004364014, + "learning_rate": 2.958374628344896e-06, + "loss": 0.8889, + "step": 542 + }, + { + "epoch": 0.9679144385026738, + "grad_norm": 1.567963719367981, + "learning_rate": 2.9534192269573836e-06, + "loss": 0.9227, + "step": 543 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 1.4358129501342773, + "learning_rate": 2.9484638255698717e-06, + "loss": 0.9275, + "step": 544 + }, + { + "epoch": 0.9714795008912656, + "grad_norm": 1.5509508848190308, + "learning_rate": 2.9435084241823593e-06, + "loss": 0.9326, + "step": 545 + }, + { + "epoch": 0.9732620320855615, + "grad_norm": 1.381090760231018, + "learning_rate": 2.9385530227948466e-06, + "loss": 0.9265, + "step": 546 + }, + { + "epoch": 0.9750445632798574, + "grad_norm": 1.2564153671264648, + "learning_rate": 2.9335976214073342e-06, + "loss": 0.9274, + "step": 547 + }, + { + "epoch": 0.9768270944741533, + "grad_norm": 1.2519657611846924, + "learning_rate": 2.9286422200198215e-06, + "loss": 0.9072, + "step": 548 + }, + { + "epoch": 0.9786096256684492, + "grad_norm": 1.3204387426376343, + "learning_rate": 2.923686818632309e-06, + "loss": 0.9551, + "step": 549 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 1.4377044439315796, + "learning_rate": 2.9187314172447972e-06, + "loss": 0.9298, + "step": 550 + }, + { + "epoch": 0.9803921568627451, + "eval_loss": 0.9987896680831909, + "eval_runtime": 46.9021, + "eval_samples_per_second": 21.321, + "eval_steps_per_second": 1.343, + "step": 550 + }, + { 
+ "epoch": 0.982174688057041, + "grad_norm": 1.2541097402572632, + "learning_rate": 2.913776015857285e-06, + "loss": 0.9214, + "step": 551 + }, + { + "epoch": 0.983957219251337, + "grad_norm": 1.2618757486343384, + "learning_rate": 2.908820614469772e-06, + "loss": 0.9441, + "step": 552 + }, + { + "epoch": 0.9857397504456328, + "grad_norm": 1.393532156944275, + "learning_rate": 2.9038652130822598e-06, + "loss": 0.9201, + "step": 553 + }, + { + "epoch": 0.9875222816399287, + "grad_norm": 1.4977319240570068, + "learning_rate": 2.8989098116947474e-06, + "loss": 0.8966, + "step": 554 + }, + { + "epoch": 0.9893048128342246, + "grad_norm": 1.5135530233383179, + "learning_rate": 2.8939544103072355e-06, + "loss": 0.9311, + "step": 555 + }, + { + "epoch": 0.9910873440285205, + "grad_norm": 1.2576534748077393, + "learning_rate": 2.8889990089197228e-06, + "loss": 0.9059, + "step": 556 + }, + { + "epoch": 0.9928698752228164, + "grad_norm": 1.3360989093780518, + "learning_rate": 2.8840436075322104e-06, + "loss": 0.9377, + "step": 557 + }, + { + "epoch": 0.9946524064171123, + "grad_norm": 1.4871883392333984, + "learning_rate": 2.8790882061446977e-06, + "loss": 0.8884, + "step": 558 + }, + { + "epoch": 0.9964349376114082, + "grad_norm": 1.3083568811416626, + "learning_rate": 2.8741328047571853e-06, + "loss": 0.9614, + "step": 559 + }, + { + "epoch": 0.9982174688057041, + "grad_norm": 1.3526344299316406, + "learning_rate": 2.8691774033696734e-06, + "loss": 0.9117, + "step": 560 + }, + { + "epoch": 0.9982174688057041, + "eval_loss": 0.9987173080444336, + "eval_runtime": 46.7749, + "eval_samples_per_second": 21.379, + "eval_steps_per_second": 1.347, + "step": 560 + }, + { + "epoch": 1.0, + "grad_norm": 1.3440344333648682, + "learning_rate": 2.864222001982161e-06, + "loss": 0.9054, + "step": 561 + }, + { + "epoch": 1.0017825311942958, + "grad_norm": 1.4617632627487183, + "learning_rate": 2.8592666005946483e-06, + "loss": 0.8767, + "step": 562 + }, + { + "epoch": 1.0035650623885919, + "grad_norm": 1.4687554836273193, + "learning_rate": 2.854311199207136e-06, + "loss": 0.8534, + "step": 563 + }, + { + "epoch": 1.0053475935828877, + "grad_norm": 1.4026336669921875, + "learning_rate": 2.8493557978196236e-06, + "loss": 0.8736, + "step": 564 + }, + { + "epoch": 1.0071301247771836, + "grad_norm": 1.4234068393707275, + "learning_rate": 2.844400396432111e-06, + "loss": 0.9045, + "step": 565 + }, + { + "epoch": 1.0089126559714796, + "grad_norm": 1.2911219596862793, + "learning_rate": 2.839444995044599e-06, + "loss": 0.8587, + "step": 566 + }, + { + "epoch": 1.0106951871657754, + "grad_norm": 1.4888267517089844, + "learning_rate": 2.8344895936570866e-06, + "loss": 0.8876, + "step": 567 + }, + { + "epoch": 1.0124777183600713, + "grad_norm": 1.3002138137817383, + "learning_rate": 2.829534192269574e-06, + "loss": 0.9234, + "step": 568 + }, + { + "epoch": 1.014260249554367, + "grad_norm": 1.3918858766555786, + "learning_rate": 2.8245787908820615e-06, + "loss": 0.8831, + "step": 569 + }, + { + "epoch": 1.0160427807486632, + "grad_norm": 1.284764051437378, + "learning_rate": 2.819623389494549e-06, + "loss": 0.9112, + "step": 570 + }, + { + "epoch": 1.0160427807486632, + "eval_loss": 0.9963147044181824, + "eval_runtime": 46.8613, + "eval_samples_per_second": 21.34, + "eval_steps_per_second": 1.344, + "step": 570 + }, + { + "epoch": 1.017825311942959, + "grad_norm": 1.4841303825378418, + "learning_rate": 2.8146679881070373e-06, + "loss": 0.8811, + "step": 571 + }, + { + "epoch": 1.0196078431372548, + "grad_norm": 
1.4724706411361694, + "learning_rate": 2.8097125867195245e-06, + "loss": 0.8884, + "step": 572 + }, + { + "epoch": 1.0213903743315509, + "grad_norm": 1.5961391925811768, + "learning_rate": 2.804757185332012e-06, + "loss": 0.8845, + "step": 573 + }, + { + "epoch": 1.0231729055258467, + "grad_norm": 2.456153154373169, + "learning_rate": 2.7998017839445e-06, + "loss": 0.9175, + "step": 574 + }, + { + "epoch": 1.0249554367201426, + "grad_norm": 1.4625294208526611, + "learning_rate": 2.794846382556987e-06, + "loss": 0.883, + "step": 575 + }, + { + "epoch": 1.0267379679144386, + "grad_norm": 1.4096879959106445, + "learning_rate": 2.789890981169475e-06, + "loss": 0.9139, + "step": 576 + }, + { + "epoch": 1.0285204991087344, + "grad_norm": 1.606987714767456, + "learning_rate": 2.784935579781963e-06, + "loss": 0.9002, + "step": 577 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 2.5259013175964355, + "learning_rate": 2.77998017839445e-06, + "loss": 0.8221, + "step": 578 + }, + { + "epoch": 1.032085561497326, + "grad_norm": 1.870883822441101, + "learning_rate": 2.7750247770069377e-06, + "loss": 0.885, + "step": 579 + }, + { + "epoch": 1.0338680926916222, + "grad_norm": 1.4817109107971191, + "learning_rate": 2.7700693756194254e-06, + "loss": 0.872, + "step": 580 + }, + { + "epoch": 1.0338680926916222, + "eval_loss": 0.9969404935836792, + "eval_runtime": 46.8454, + "eval_samples_per_second": 21.347, + "eval_steps_per_second": 1.345, + "step": 580 + }, + { + "epoch": 1.035650623885918, + "grad_norm": 1.7585666179656982, + "learning_rate": 2.7651139742319126e-06, + "loss": 0.9138, + "step": 581 + }, + { + "epoch": 1.0374331550802138, + "grad_norm": 1.8026617765426636, + "learning_rate": 2.7601585728444007e-06, + "loss": 0.8799, + "step": 582 + }, + { + "epoch": 1.0392156862745099, + "grad_norm": 1.6723577976226807, + "learning_rate": 2.7552031714568884e-06, + "loss": 0.9084, + "step": 583 + }, + { + "epoch": 1.0409982174688057, + "grad_norm": 1.8082846403121948, + "learning_rate": 2.750247770069376e-06, + "loss": 0.8649, + "step": 584 + }, + { + "epoch": 1.0427807486631016, + "grad_norm": 1.685368299484253, + "learning_rate": 2.7452923686818633e-06, + "loss": 0.9086, + "step": 585 + }, + { + "epoch": 1.0445632798573976, + "grad_norm": 1.6961652040481567, + "learning_rate": 2.740336967294351e-06, + "loss": 0.8912, + "step": 586 + }, + { + "epoch": 1.0463458110516934, + "grad_norm": 1.55865478515625, + "learning_rate": 2.735381565906839e-06, + "loss": 0.8864, + "step": 587 + }, + { + "epoch": 1.0481283422459893, + "grad_norm": 1.8303790092468262, + "learning_rate": 2.7304261645193263e-06, + "loss": 0.8801, + "step": 588 + }, + { + "epoch": 1.049910873440285, + "grad_norm": 1.6978795528411865, + "learning_rate": 2.725470763131814e-06, + "loss": 0.8463, + "step": 589 + }, + { + "epoch": 1.0516934046345812, + "grad_norm": 1.6878262758255005, + "learning_rate": 2.7205153617443016e-06, + "loss": 0.8927, + "step": 590 + }, + { + "epoch": 1.0516934046345812, + "eval_loss": 0.9930296540260315, + "eval_runtime": 46.8922, + "eval_samples_per_second": 21.326, + "eval_steps_per_second": 1.344, + "step": 590 + }, + { + "epoch": 1.053475935828877, + "grad_norm": 1.3838971853256226, + "learning_rate": 2.715559960356789e-06, + "loss": 0.9149, + "step": 591 + }, + { + "epoch": 1.0552584670231728, + "grad_norm": 1.689635992050171, + "learning_rate": 2.710604558969277e-06, + "loss": 0.893, + "step": 592 + }, + { + "epoch": 1.0570409982174689, + "grad_norm": 1.940187692642212, + "learning_rate": 
2.7056491575817646e-06, + "loss": 0.8938, + "step": 593 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 1.5956275463104248, + "learning_rate": 2.700693756194252e-06, + "loss": 0.8764, + "step": 594 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 1.4798537492752075, + "learning_rate": 2.6957383548067395e-06, + "loss": 0.8709, + "step": 595 + }, + { + "epoch": 1.0623885918003566, + "grad_norm": 1.2672936916351318, + "learning_rate": 2.690782953419227e-06, + "loss": 0.8538, + "step": 596 + }, + { + "epoch": 1.0641711229946524, + "grad_norm": 1.2904959917068481, + "learning_rate": 2.6858275520317144e-06, + "loss": 0.8187, + "step": 597 + }, + { + "epoch": 1.0659536541889483, + "grad_norm": 1.4401811361312866, + "learning_rate": 2.6808721506442025e-06, + "loss": 0.8775, + "step": 598 + }, + { + "epoch": 1.067736185383244, + "grad_norm": 1.3092564344406128, + "learning_rate": 2.67591674925669e-06, + "loss": 0.9057, + "step": 599 + }, + { + "epoch": 1.0695187165775402, + "grad_norm": 1.6517740488052368, + "learning_rate": 2.6709613478691778e-06, + "loss": 0.883, + "step": 600 + }, + { + "epoch": 1.0695187165775402, + "eval_loss": 0.9936122298240662, + "eval_runtime": 46.824, + "eval_samples_per_second": 21.357, + "eval_steps_per_second": 1.345, + "step": 600 + }, + { + "epoch": 1.071301247771836, + "grad_norm": 1.5126500129699707, + "learning_rate": 2.666005946481665e-06, + "loss": 0.8645, + "step": 601 + }, + { + "epoch": 1.0730837789661318, + "grad_norm": 1.408370852470398, + "learning_rate": 2.6610505450941527e-06, + "loss": 0.8944, + "step": 602 + }, + { + "epoch": 1.0748663101604279, + "grad_norm": 1.4243714809417725, + "learning_rate": 2.6560951437066408e-06, + "loss": 0.8935, + "step": 603 + }, + { + "epoch": 1.0766488413547237, + "grad_norm": 1.4721286296844482, + "learning_rate": 2.651139742319128e-06, + "loss": 0.8808, + "step": 604 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 1.4304018020629883, + "learning_rate": 2.6461843409316157e-06, + "loss": 0.8792, + "step": 605 + }, + { + "epoch": 1.0802139037433156, + "grad_norm": 1.4544298648834229, + "learning_rate": 2.6412289395441033e-06, + "loss": 0.877, + "step": 606 + }, + { + "epoch": 1.0819964349376114, + "grad_norm": 1.409246802330017, + "learning_rate": 2.6362735381565906e-06, + "loss": 0.8757, + "step": 607 + }, + { + "epoch": 1.0837789661319073, + "grad_norm": 1.617339015007019, + "learning_rate": 2.6313181367690786e-06, + "loss": 0.8767, + "step": 608 + }, + { + "epoch": 1.085561497326203, + "grad_norm": 1.5547486543655396, + "learning_rate": 2.6263627353815663e-06, + "loss": 0.8893, + "step": 609 + }, + { + "epoch": 1.0873440285204992, + "grad_norm": 1.579556941986084, + "learning_rate": 2.621407333994054e-06, + "loss": 0.9028, + "step": 610 + }, + { + "epoch": 1.0873440285204992, + "eval_loss": 0.9923062920570374, + "eval_runtime": 46.7842, + "eval_samples_per_second": 21.375, + "eval_steps_per_second": 1.347, + "step": 610 + }, + { + "epoch": 1.089126559714795, + "grad_norm": 1.847340703010559, + "learning_rate": 2.616451932606541e-06, + "loss": 0.8685, + "step": 611 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 1.5406112670898438, + "learning_rate": 2.611496531219029e-06, + "loss": 0.9117, + "step": 612 + }, + { + "epoch": 1.0926916221033869, + "grad_norm": 1.3770501613616943, + "learning_rate": 2.606541129831516e-06, + "loss": 0.8371, + "step": 613 + }, + { + "epoch": 1.0944741532976827, + "grad_norm": 1.3167601823806763, + "learning_rate": 2.601585728444004e-06, + "loss": 0.8927, + "step": 
614 + }, + { + "epoch": 1.0962566844919786, + "grad_norm": 1.4795807600021362, + "learning_rate": 2.596630327056492e-06, + "loss": 0.8994, + "step": 615 + }, + { + "epoch": 1.0980392156862746, + "grad_norm": 1.7171162366867065, + "learning_rate": 2.5916749256689795e-06, + "loss": 0.877, + "step": 616 + }, + { + "epoch": 1.0998217468805704, + "grad_norm": 1.4922057390213013, + "learning_rate": 2.5867195242814668e-06, + "loss": 0.8555, + "step": 617 + }, + { + "epoch": 1.1016042780748663, + "grad_norm": 1.5104916095733643, + "learning_rate": 2.5817641228939544e-06, + "loss": 0.8668, + "step": 618 + }, + { + "epoch": 1.1033868092691623, + "grad_norm": 1.3570868968963623, + "learning_rate": 2.5768087215064425e-06, + "loss": 0.9149, + "step": 619 + }, + { + "epoch": 1.1051693404634582, + "grad_norm": 1.4930638074874878, + "learning_rate": 2.57185332011893e-06, + "loss": 0.8967, + "step": 620 + }, + { + "epoch": 1.1051693404634582, + "eval_loss": 0.9928249716758728, + "eval_runtime": 46.7413, + "eval_samples_per_second": 21.394, + "eval_steps_per_second": 1.348, + "step": 620 + }, + { + "epoch": 1.106951871657754, + "grad_norm": 1.3319581747055054, + "learning_rate": 2.5668979187314174e-06, + "loss": 0.8868, + "step": 621 + }, + { + "epoch": 1.1087344028520498, + "grad_norm": 1.4563696384429932, + "learning_rate": 2.561942517343905e-06, + "loss": 0.8676, + "step": 622 + }, + { + "epoch": 1.1105169340463459, + "grad_norm": 1.5489044189453125, + "learning_rate": 2.5569871159563923e-06, + "loss": 0.9186, + "step": 623 + }, + { + "epoch": 1.1122994652406417, + "grad_norm": 1.4728941917419434, + "learning_rate": 2.5520317145688804e-06, + "loss": 0.8981, + "step": 624 + }, + { + "epoch": 1.1140819964349375, + "grad_norm": 1.4518868923187256, + "learning_rate": 2.547076313181368e-06, + "loss": 0.8867, + "step": 625 + }, + { + "epoch": 1.1158645276292336, + "grad_norm": 1.464890480041504, + "learning_rate": 2.5421209117938557e-06, + "loss": 0.9466, + "step": 626 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 1.4894181489944458, + "learning_rate": 2.537165510406343e-06, + "loss": 0.8671, + "step": 627 + }, + { + "epoch": 1.1194295900178253, + "grad_norm": 1.6478488445281982, + "learning_rate": 2.5322101090188306e-06, + "loss": 0.8847, + "step": 628 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 1.465164065361023, + "learning_rate": 2.5272547076313183e-06, + "loss": 0.9166, + "step": 629 + }, + { + "epoch": 1.1229946524064172, + "grad_norm": 1.3165429830551147, + "learning_rate": 2.522299306243806e-06, + "loss": 0.8853, + "step": 630 + }, + { + "epoch": 1.1229946524064172, + "eval_loss": 0.99163818359375, + "eval_runtime": 46.7506, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 630 + }, + { + "epoch": 1.124777183600713, + "grad_norm": 1.6656804084777832, + "learning_rate": 2.5173439048562936e-06, + "loss": 0.8526, + "step": 631 + }, + { + "epoch": 1.1265597147950088, + "grad_norm": 1.3749207258224487, + "learning_rate": 2.5123885034687813e-06, + "loss": 0.9135, + "step": 632 + }, + { + "epoch": 1.1283422459893049, + "grad_norm": 1.4695217609405518, + "learning_rate": 2.5074331020812685e-06, + "loss": 0.8788, + "step": 633 + }, + { + "epoch": 1.1301247771836007, + "grad_norm": 1.4157956838607788, + "learning_rate": 2.502477700693756e-06, + "loss": 0.8811, + "step": 634 + }, + { + "epoch": 1.1319073083778965, + "grad_norm": 1.3357295989990234, + "learning_rate": 2.497522299306244e-06, + "loss": 0.8767, + "step": 635 + }, + { + "epoch": 1.1336898395721926, + 
"grad_norm": 1.7020845413208008, + "learning_rate": 2.492566897918732e-06, + "loss": 0.8714, + "step": 636 + }, + { + "epoch": 1.1354723707664884, + "grad_norm": 1.4703397750854492, + "learning_rate": 2.487611496531219e-06, + "loss": 0.8921, + "step": 637 + }, + { + "epoch": 1.1372549019607843, + "grad_norm": 2.017592430114746, + "learning_rate": 2.482656095143707e-06, + "loss": 0.865, + "step": 638 + }, + { + "epoch": 1.1390374331550803, + "grad_norm": 1.4744309186935425, + "learning_rate": 2.4777006937561945e-06, + "loss": 0.8894, + "step": 639 + }, + { + "epoch": 1.1408199643493762, + "grad_norm": 1.4797626733779907, + "learning_rate": 2.472745292368682e-06, + "loss": 0.8839, + "step": 640 + }, + { + "epoch": 1.1408199643493762, + "eval_loss": 0.9916447401046753, + "eval_runtime": 46.6435, + "eval_samples_per_second": 21.439, + "eval_steps_per_second": 1.351, + "step": 640 + }, + { + "epoch": 1.142602495543672, + "grad_norm": 1.2910147905349731, + "learning_rate": 2.4677898909811694e-06, + "loss": 0.8608, + "step": 641 + }, + { + "epoch": 1.1443850267379678, + "grad_norm": 1.276078701019287, + "learning_rate": 2.4628344895936575e-06, + "loss": 0.8723, + "step": 642 + }, + { + "epoch": 1.1461675579322639, + "grad_norm": 1.7418657541275024, + "learning_rate": 2.4578790882061447e-06, + "loss": 0.8726, + "step": 643 + }, + { + "epoch": 1.1479500891265597, + "grad_norm": 1.5978609323501587, + "learning_rate": 2.4529236868186328e-06, + "loss": 0.867, + "step": 644 + }, + { + "epoch": 1.1497326203208555, + "grad_norm": 1.3553117513656616, + "learning_rate": 2.44796828543112e-06, + "loss": 0.8993, + "step": 645 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 1.5411593914031982, + "learning_rate": 2.4430128840436077e-06, + "loss": 0.8556, + "step": 646 + }, + { + "epoch": 1.1532976827094474, + "grad_norm": 1.4576947689056396, + "learning_rate": 2.4380574826560953e-06, + "loss": 0.8889, + "step": 647 + }, + { + "epoch": 1.1550802139037433, + "grad_norm": 1.4621020555496216, + "learning_rate": 2.433102081268583e-06, + "loss": 0.8586, + "step": 648 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 1.409480094909668, + "learning_rate": 2.4281466798810702e-06, + "loss": 0.8656, + "step": 649 + }, + { + "epoch": 1.1586452762923352, + "grad_norm": 1.461784839630127, + "learning_rate": 2.4231912784935583e-06, + "loss": 0.9129, + "step": 650 + }, + { + "epoch": 1.1586452762923352, + "eval_loss": 0.9930744767189026, + "eval_runtime": 46.6771, + "eval_samples_per_second": 21.424, + "eval_steps_per_second": 1.35, + "step": 650 + }, + { + "epoch": 1.160427807486631, + "grad_norm": 1.2690680027008057, + "learning_rate": 2.4182358771060456e-06, + "loss": 0.8445, + "step": 651 + }, + { + "epoch": 1.1622103386809268, + "grad_norm": 1.6523184776306152, + "learning_rate": 2.4132804757185337e-06, + "loss": 0.8714, + "step": 652 + }, + { + "epoch": 1.1639928698752229, + "grad_norm": 1.4627327919006348, + "learning_rate": 2.408325074331021e-06, + "loss": 0.8907, + "step": 653 + }, + { + "epoch": 1.1657754010695187, + "grad_norm": 1.3082153797149658, + "learning_rate": 2.4033696729435086e-06, + "loss": 0.8873, + "step": 654 + }, + { + "epoch": 1.1675579322638145, + "grad_norm": 1.4305182695388794, + "learning_rate": 2.3984142715559962e-06, + "loss": 0.9341, + "step": 655 + }, + { + "epoch": 1.1693404634581106, + "grad_norm": 1.3877581357955933, + "learning_rate": 2.393458870168484e-06, + "loss": 0.834, + "step": 656 + }, + { + "epoch": 1.1711229946524064, + "grad_norm": 1.5337421894073486, + 
"learning_rate": 2.3885034687809715e-06, + "loss": 0.8588, + "step": 657 + }, + { + "epoch": 1.1729055258467023, + "grad_norm": 1.6537212133407593, + "learning_rate": 2.383548067393459e-06, + "loss": 0.8761, + "step": 658 + }, + { + "epoch": 1.1746880570409983, + "grad_norm": 1.5609780550003052, + "learning_rate": 2.3785926660059464e-06, + "loss": 0.88, + "step": 659 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.4873936176300049, + "learning_rate": 2.3736372646184345e-06, + "loss": 0.8904, + "step": 660 + }, + { + "epoch": 1.1764705882352942, + "eval_loss": 0.9909861087799072, + "eval_runtime": 46.6989, + "eval_samples_per_second": 21.414, + "eval_steps_per_second": 1.349, + "step": 660 + }, + { + "epoch": 1.17825311942959, + "grad_norm": 1.471859097480774, + "learning_rate": 2.3686818632309218e-06, + "loss": 0.8774, + "step": 661 + }, + { + "epoch": 1.1800356506238858, + "grad_norm": 1.4559658765792847, + "learning_rate": 2.3637264618434094e-06, + "loss": 0.8649, + "step": 662 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 1.5359556674957275, + "learning_rate": 2.358771060455897e-06, + "loss": 0.8953, + "step": 663 + }, + { + "epoch": 1.1836007130124777, + "grad_norm": 1.2817641496658325, + "learning_rate": 2.3538156590683847e-06, + "loss": 0.8987, + "step": 664 + }, + { + "epoch": 1.1853832442067735, + "grad_norm": 1.2599672079086304, + "learning_rate": 2.3488602576808724e-06, + "loss": 0.8735, + "step": 665 + }, + { + "epoch": 1.1871657754010696, + "grad_norm": 1.341021180152893, + "learning_rate": 2.34390485629336e-06, + "loss": 0.8981, + "step": 666 + }, + { + "epoch": 1.1889483065953654, + "grad_norm": 1.4567780494689941, + "learning_rate": 2.3389494549058473e-06, + "loss": 0.9018, + "step": 667 + }, + { + "epoch": 1.1907308377896613, + "grad_norm": 1.7645756006240845, + "learning_rate": 2.3339940535183354e-06, + "loss": 0.8856, + "step": 668 + }, + { + "epoch": 1.192513368983957, + "grad_norm": 1.4657479524612427, + "learning_rate": 2.3290386521308226e-06, + "loss": 0.9083, + "step": 669 + }, + { + "epoch": 1.1942959001782532, + "grad_norm": 1.269451379776001, + "learning_rate": 2.3240832507433103e-06, + "loss": 0.887, + "step": 670 + }, + { + "epoch": 1.1942959001782532, + "eval_loss": 0.9889793992042542, + "eval_runtime": 46.8047, + "eval_samples_per_second": 21.365, + "eval_steps_per_second": 1.346, + "step": 670 + }, + { + "epoch": 1.196078431372549, + "grad_norm": 1.3132225275039673, + "learning_rate": 2.319127849355798e-06, + "loss": 0.8676, + "step": 671 + }, + { + "epoch": 1.1978609625668448, + "grad_norm": 1.3520387411117554, + "learning_rate": 2.3141724479682856e-06, + "loss": 0.9, + "step": 672 + }, + { + "epoch": 1.1996434937611409, + "grad_norm": 1.392001986503601, + "learning_rate": 2.3092170465807733e-06, + "loss": 0.8749, + "step": 673 + }, + { + "epoch": 1.2014260249554367, + "grad_norm": 1.4109132289886475, + "learning_rate": 2.304261645193261e-06, + "loss": 0.9133, + "step": 674 + }, + { + "epoch": 1.2032085561497325, + "grad_norm": 1.6250007152557373, + "learning_rate": 2.2993062438057486e-06, + "loss": 0.8689, + "step": 675 + }, + { + "epoch": 1.2049910873440286, + "grad_norm": 1.4920040369033813, + "learning_rate": 2.2943508424182363e-06, + "loss": 0.9031, + "step": 676 + }, + { + "epoch": 1.2067736185383244, + "grad_norm": 1.2313125133514404, + "learning_rate": 2.2893954410307235e-06, + "loss": 0.8538, + "step": 677 + }, + { + "epoch": 1.2085561497326203, + "grad_norm": 1.5707231760025024, + "learning_rate": 2.284440039643211e-06, + "loss": 
0.9009, + "step": 678 + }, + { + "epoch": 1.2103386809269163, + "grad_norm": 1.4293330907821655, + "learning_rate": 2.279484638255699e-06, + "loss": 0.8994, + "step": 679 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 1.404422640800476, + "learning_rate": 2.2745292368681865e-06, + "loss": 0.9259, + "step": 680 + }, + { + "epoch": 1.2121212121212122, + "eval_loss": 0.9861985445022583, + "eval_runtime": 46.7287, + "eval_samples_per_second": 21.4, + "eval_steps_per_second": 1.348, + "step": 680 + }, + { + "epoch": 1.213903743315508, + "grad_norm": 1.2691755294799805, + "learning_rate": 2.269573835480674e-06, + "loss": 0.8818, + "step": 681 + }, + { + "epoch": 1.215686274509804, + "grad_norm": 1.478243350982666, + "learning_rate": 2.264618434093162e-06, + "loss": 0.86, + "step": 682 + }, + { + "epoch": 1.2174688057040999, + "grad_norm": 1.398539662361145, + "learning_rate": 2.2596630327056495e-06, + "loss": 0.8802, + "step": 683 + }, + { + "epoch": 1.2192513368983957, + "grad_norm": 1.4929214715957642, + "learning_rate": 2.254707631318137e-06, + "loss": 0.8576, + "step": 684 + }, + { + "epoch": 1.2210338680926915, + "grad_norm": 1.4856244325637817, + "learning_rate": 2.2497522299306244e-06, + "loss": 0.8893, + "step": 685 + }, + { + "epoch": 1.2228163992869876, + "grad_norm": 1.3881354331970215, + "learning_rate": 2.244796828543112e-06, + "loss": 0.866, + "step": 686 + }, + { + "epoch": 1.2245989304812834, + "grad_norm": 1.413635492324829, + "learning_rate": 2.2398414271555997e-06, + "loss": 0.8584, + "step": 687 + }, + { + "epoch": 1.2263814616755793, + "grad_norm": 1.5331799983978271, + "learning_rate": 2.2348860257680874e-06, + "loss": 0.8614, + "step": 688 + }, + { + "epoch": 1.228163992869875, + "grad_norm": 1.396995186805725, + "learning_rate": 2.229930624380575e-06, + "loss": 0.9054, + "step": 689 + }, + { + "epoch": 1.2299465240641712, + "grad_norm": 1.6812461614608765, + "learning_rate": 2.2249752229930627e-06, + "loss": 0.8831, + "step": 690 + }, + { + "epoch": 1.2299465240641712, + "eval_loss": 0.9851806163787842, + "eval_runtime": 46.6927, + "eval_samples_per_second": 21.417, + "eval_steps_per_second": 1.349, + "step": 690 + }, + { + "epoch": 1.231729055258467, + "grad_norm": 1.4893524646759033, + "learning_rate": 2.2200198216055503e-06, + "loss": 0.8903, + "step": 691 + }, + { + "epoch": 1.2335115864527628, + "grad_norm": 1.5997936725616455, + "learning_rate": 2.215064420218038e-06, + "loss": 0.8972, + "step": 692 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.4399421215057373, + "learning_rate": 2.2101090188305257e-06, + "loss": 0.8639, + "step": 693 + }, + { + "epoch": 1.2370766488413547, + "grad_norm": 1.5507882833480835, + "learning_rate": 2.205153617443013e-06, + "loss": 0.8558, + "step": 694 + }, + { + "epoch": 1.2388591800356505, + "grad_norm": 1.3248441219329834, + "learning_rate": 2.2001982160555006e-06, + "loss": 0.8699, + "step": 695 + }, + { + "epoch": 1.2406417112299466, + "grad_norm": 1.4937480688095093, + "learning_rate": 2.1952428146679882e-06, + "loss": 0.8935, + "step": 696 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 1.5931757688522339, + "learning_rate": 2.190287413280476e-06, + "loss": 0.9007, + "step": 697 + }, + { + "epoch": 1.2442067736185383, + "grad_norm": 1.7570197582244873, + "learning_rate": 2.1853320118929636e-06, + "loss": 0.8546, + "step": 698 + }, + { + "epoch": 1.2459893048128343, + "grad_norm": 1.621984601020813, + "learning_rate": 2.1803766105054512e-06, + "loss": 0.901, + "step": 699 + }, + { + "epoch": 
1.2477718360071302, + "grad_norm": 1.3966022729873657, + "learning_rate": 2.175421209117939e-06, + "loss": 0.9307, + "step": 700 + }, + { + "epoch": 1.2477718360071302, + "eval_loss": 0.9874935150146484, + "eval_runtime": 46.6795, + "eval_samples_per_second": 21.423, + "eval_steps_per_second": 1.35, + "step": 700 + }, + { + "epoch": 1.249554367201426, + "grad_norm": 1.3309043645858765, + "learning_rate": 2.1704658077304265e-06, + "loss": 0.9206, + "step": 701 + }, + { + "epoch": 1.251336898395722, + "grad_norm": 1.3344178199768066, + "learning_rate": 2.1655104063429138e-06, + "loss": 0.868, + "step": 702 + }, + { + "epoch": 1.2531194295900179, + "grad_norm": 1.3813674449920654, + "learning_rate": 2.160555004955402e-06, + "loss": 0.8573, + "step": 703 + }, + { + "epoch": 1.2549019607843137, + "grad_norm": 1.44670832157135, + "learning_rate": 2.155599603567889e-06, + "loss": 0.8964, + "step": 704 + }, + { + "epoch": 1.2566844919786098, + "grad_norm": 1.732297420501709, + "learning_rate": 2.1506442021803768e-06, + "loss": 0.8635, + "step": 705 + }, + { + "epoch": 1.2584670231729056, + "grad_norm": 1.3183728456497192, + "learning_rate": 2.1456888007928644e-06, + "loss": 0.8917, + "step": 706 + }, + { + "epoch": 1.2602495543672014, + "grad_norm": 1.4485280513763428, + "learning_rate": 2.140733399405352e-06, + "loss": 0.8731, + "step": 707 + }, + { + "epoch": 1.2620320855614973, + "grad_norm": 1.4543678760528564, + "learning_rate": 2.1357779980178398e-06, + "loss": 0.9099, + "step": 708 + }, + { + "epoch": 1.263814616755793, + "grad_norm": 1.3342758417129517, + "learning_rate": 2.1308225966303274e-06, + "loss": 0.8704, + "step": 709 + }, + { + "epoch": 1.2655971479500892, + "grad_norm": 1.6935384273529053, + "learning_rate": 2.1258671952428147e-06, + "loss": 0.8775, + "step": 710 + }, + { + "epoch": 1.2655971479500892, + "eval_loss": 0.989654541015625, + "eval_runtime": 46.5841, + "eval_samples_per_second": 21.467, + "eval_steps_per_second": 1.352, + "step": 710 + }, + { + "epoch": 1.267379679144385, + "grad_norm": 1.6754915714263916, + "learning_rate": 2.1209117938553027e-06, + "loss": 0.8715, + "step": 711 + }, + { + "epoch": 1.2691622103386808, + "grad_norm": 1.4536867141723633, + "learning_rate": 2.11595639246779e-06, + "loss": 0.8805, + "step": 712 + }, + { + "epoch": 1.2709447415329769, + "grad_norm": 1.46432626247406, + "learning_rate": 2.1110009910802776e-06, + "loss": 0.898, + "step": 713 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.3130420446395874, + "learning_rate": 2.1060455896927653e-06, + "loss": 0.8851, + "step": 714 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 1.454445242881775, + "learning_rate": 2.101090188305253e-06, + "loss": 0.9093, + "step": 715 + }, + { + "epoch": 1.2762923351158646, + "grad_norm": 1.560789942741394, + "learning_rate": 2.0961347869177406e-06, + "loss": 0.8874, + "step": 716 + }, + { + "epoch": 1.2780748663101604, + "grad_norm": 1.632129430770874, + "learning_rate": 2.0911793855302283e-06, + "loss": 0.8691, + "step": 717 + }, + { + "epoch": 1.2798573975044563, + "grad_norm": 1.4793459177017212, + "learning_rate": 2.0862239841427155e-06, + "loss": 0.8766, + "step": 718 + }, + { + "epoch": 1.2816399286987523, + "grad_norm": 1.4158271551132202, + "learning_rate": 2.0812685827552036e-06, + "loss": 0.8646, + "step": 719 + }, + { + "epoch": 1.2834224598930482, + "grad_norm": 1.508212924003601, + "learning_rate": 2.076313181367691e-06, + "loss": 0.8856, + "step": 720 + }, + { + "epoch": 1.2834224598930482, + "eval_loss": 0.9860122799873352, 
+ "eval_runtime": 46.8096, + "eval_samples_per_second": 21.363, + "eval_steps_per_second": 1.346, + "step": 720 + }, + { + "epoch": 1.285204991087344, + "grad_norm": 1.4124470949172974, + "learning_rate": 2.0713577799801785e-06, + "loss": 0.8816, + "step": 721 + }, + { + "epoch": 1.28698752228164, + "grad_norm": 1.4536359310150146, + "learning_rate": 2.066402378592666e-06, + "loss": 0.8729, + "step": 722 + }, + { + "epoch": 1.2887700534759359, + "grad_norm": 1.6018034219741821, + "learning_rate": 2.061446977205154e-06, + "loss": 0.8749, + "step": 723 + }, + { + "epoch": 1.2905525846702317, + "grad_norm": 1.3637772798538208, + "learning_rate": 2.0564915758176415e-06, + "loss": 0.8483, + "step": 724 + }, + { + "epoch": 1.2923351158645278, + "grad_norm": 1.6507648229599, + "learning_rate": 2.051536174430129e-06, + "loss": 0.8598, + "step": 725 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.37489652633667, + "learning_rate": 2.0465807730426164e-06, + "loss": 0.8736, + "step": 726 + }, + { + "epoch": 1.2959001782531194, + "grad_norm": 1.2748007774353027, + "learning_rate": 2.0416253716551045e-06, + "loss": 0.863, + "step": 727 + }, + { + "epoch": 1.2976827094474153, + "grad_norm": 1.4101521968841553, + "learning_rate": 2.0366699702675917e-06, + "loss": 0.864, + "step": 728 + }, + { + "epoch": 1.299465240641711, + "grad_norm": 1.4258012771606445, + "learning_rate": 2.0317145688800794e-06, + "loss": 0.9237, + "step": 729 + }, + { + "epoch": 1.3012477718360071, + "grad_norm": 1.3425439596176147, + "learning_rate": 2.026759167492567e-06, + "loss": 0.8798, + "step": 730 + }, + { + "epoch": 1.3012477718360071, + "eval_loss": 0.9809737801551819, + "eval_runtime": 47.0146, + "eval_samples_per_second": 21.27, + "eval_steps_per_second": 1.34, + "step": 730 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 1.4874835014343262, + "learning_rate": 2.0218037661050547e-06, + "loss": 0.8953, + "step": 731 + }, + { + "epoch": 1.3048128342245988, + "grad_norm": 1.3893389701843262, + "learning_rate": 2.0168483647175424e-06, + "loss": 0.8579, + "step": 732 + }, + { + "epoch": 1.3065953654188949, + "grad_norm": 1.4454811811447144, + "learning_rate": 2.01189296333003e-06, + "loss": 0.865, + "step": 733 + }, + { + "epoch": 1.3083778966131907, + "grad_norm": 1.3343634605407715, + "learning_rate": 2.0069375619425173e-06, + "loss": 0.8737, + "step": 734 + }, + { + "epoch": 1.3101604278074865, + "grad_norm": 1.5064631700515747, + "learning_rate": 2.0019821605550054e-06, + "loss": 0.8625, + "step": 735 + }, + { + "epoch": 1.3119429590017826, + "grad_norm": 1.5989820957183838, + "learning_rate": 1.9970267591674926e-06, + "loss": 0.8926, + "step": 736 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 1.5042405128479004, + "learning_rate": 1.9920713577799803e-06, + "loss": 0.8646, + "step": 737 + }, + { + "epoch": 1.3155080213903743, + "grad_norm": 1.463318109512329, + "learning_rate": 1.987115956392468e-06, + "loss": 0.8918, + "step": 738 + }, + { + "epoch": 1.3172905525846703, + "grad_norm": 1.477638602256775, + "learning_rate": 1.9821605550049556e-06, + "loss": 0.8414, + "step": 739 + }, + { + "epoch": 1.3190730837789661, + "grad_norm": 1.4441951513290405, + "learning_rate": 1.9772051536174432e-06, + "loss": 0.8728, + "step": 740 + }, + { + "epoch": 1.3190730837789661, + "eval_loss": 0.9812989830970764, + "eval_runtime": 46.879, + "eval_samples_per_second": 21.332, + "eval_steps_per_second": 1.344, + "step": 740 + }, + { + "epoch": 1.320855614973262, + "grad_norm": 1.275374174118042, + "learning_rate": 
1.972249752229931e-06, + "loss": 0.8911, + "step": 741 + }, + { + "epoch": 1.322638146167558, + "grad_norm": 1.400856614112854, + "learning_rate": 1.967294350842418e-06, + "loss": 0.8334, + "step": 742 + }, + { + "epoch": 1.3244206773618539, + "grad_norm": 1.4594979286193848, + "learning_rate": 1.9623389494549062e-06, + "loss": 0.8453, + "step": 743 + }, + { + "epoch": 1.3262032085561497, + "grad_norm": 1.2328113317489624, + "learning_rate": 1.9573835480673935e-06, + "loss": 0.8652, + "step": 744 + }, + { + "epoch": 1.3279857397504458, + "grad_norm": 1.5369375944137573, + "learning_rate": 1.952428146679881e-06, + "loss": 0.8884, + "step": 745 + }, + { + "epoch": 1.3297682709447416, + "grad_norm": 1.4707444906234741, + "learning_rate": 1.9474727452923688e-06, + "loss": 0.8811, + "step": 746 + }, + { + "epoch": 1.3315508021390374, + "grad_norm": 1.4895271062850952, + "learning_rate": 1.9425173439048564e-06, + "loss": 0.8704, + "step": 747 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.413124918937683, + "learning_rate": 1.937561942517344e-06, + "loss": 0.8407, + "step": 748 + }, + { + "epoch": 1.3351158645276293, + "grad_norm": 1.322543740272522, + "learning_rate": 1.9326065411298318e-06, + "loss": 0.8429, + "step": 749 + }, + { + "epoch": 1.3368983957219251, + "grad_norm": 1.3666902780532837, + "learning_rate": 1.927651139742319e-06, + "loss": 0.8587, + "step": 750 + }, + { + "epoch": 1.3368983957219251, + "eval_loss": 0.9815234541893005, + "eval_runtime": 46.813, + "eval_samples_per_second": 21.362, + "eval_steps_per_second": 1.346, + "step": 750 + }, + { + "epoch": 1.338680926916221, + "grad_norm": 1.336299180984497, + "learning_rate": 1.922695738354807e-06, + "loss": 0.8528, + "step": 751 + }, + { + "epoch": 1.3404634581105168, + "grad_norm": 1.4908182621002197, + "learning_rate": 1.9177403369672943e-06, + "loss": 0.8841, + "step": 752 + }, + { + "epoch": 1.3422459893048129, + "grad_norm": 1.2383705377578735, + "learning_rate": 1.912784935579782e-06, + "loss": 0.8387, + "step": 753 + }, + { + "epoch": 1.3440285204991087, + "grad_norm": 1.4624556303024292, + "learning_rate": 1.9078295341922697e-06, + "loss": 0.891, + "step": 754 + }, + { + "epoch": 1.3458110516934045, + "grad_norm": 1.4606925249099731, + "learning_rate": 1.9028741328047573e-06, + "loss": 0.8913, + "step": 755 + }, + { + "epoch": 1.3475935828877006, + "grad_norm": 1.403260350227356, + "learning_rate": 1.897918731417245e-06, + "loss": 0.8486, + "step": 756 + }, + { + "epoch": 1.3493761140819964, + "grad_norm": 1.407162070274353, + "learning_rate": 1.8929633300297326e-06, + "loss": 0.8588, + "step": 757 + }, + { + "epoch": 1.3511586452762923, + "grad_norm": 1.468679666519165, + "learning_rate": 1.88800792864222e-06, + "loss": 0.8619, + "step": 758 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.3879525661468506, + "learning_rate": 1.883052527254708e-06, + "loss": 0.8706, + "step": 759 + }, + { + "epoch": 1.3547237076648841, + "grad_norm": 1.4178467988967896, + "learning_rate": 1.8780971258671954e-06, + "loss": 0.867, + "step": 760 + }, + { + "epoch": 1.3547237076648841, + "eval_loss": 0.9822861552238464, + "eval_runtime": 46.8957, + "eval_samples_per_second": 21.324, + "eval_steps_per_second": 1.343, + "step": 760 + }, + { + "epoch": 1.35650623885918, + "grad_norm": 1.4034419059753418, + "learning_rate": 1.8731417244796829e-06, + "loss": 0.8609, + "step": 761 + }, + { + "epoch": 1.358288770053476, + "grad_norm": 1.3722320795059204, + "learning_rate": 1.8681863230921707e-06, + "loss": 0.8748, + "step": 762 
+ }, + { + "epoch": 1.3600713012477719, + "grad_norm": 1.5463820695877075, + "learning_rate": 1.8632309217046582e-06, + "loss": 0.8766, + "step": 763 + }, + { + "epoch": 1.3618538324420677, + "grad_norm": 1.416810154914856, + "learning_rate": 1.858275520317146e-06, + "loss": 0.9066, + "step": 764 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.387640118598938, + "learning_rate": 1.8533201189296335e-06, + "loss": 0.8714, + "step": 765 + }, + { + "epoch": 1.3654188948306596, + "grad_norm": 1.4583455324172974, + "learning_rate": 1.848364717542121e-06, + "loss": 0.8479, + "step": 766 + }, + { + "epoch": 1.3672014260249554, + "grad_norm": 1.587016224861145, + "learning_rate": 1.8434093161546088e-06, + "loss": 0.8807, + "step": 767 + }, + { + "epoch": 1.3689839572192513, + "grad_norm": 1.321548342704773, + "learning_rate": 1.8384539147670963e-06, + "loss": 0.8939, + "step": 768 + }, + { + "epoch": 1.3707664884135473, + "grad_norm": 1.3802001476287842, + "learning_rate": 1.8334985133795837e-06, + "loss": 0.8547, + "step": 769 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 1.4780387878417969, + "learning_rate": 1.8285431119920716e-06, + "loss": 0.8684, + "step": 770 + }, + { + "epoch": 1.3725490196078431, + "eval_loss": 0.9845251441001892, + "eval_runtime": 47.0226, + "eval_samples_per_second": 21.266, + "eval_steps_per_second": 1.34, + "step": 770 + }, + { + "epoch": 1.374331550802139, + "grad_norm": 1.3145530223846436, + "learning_rate": 1.823587710604559e-06, + "loss": 0.8808, + "step": 771 + }, + { + "epoch": 1.3761140819964348, + "grad_norm": 1.316864252090454, + "learning_rate": 1.818632309217047e-06, + "loss": 0.8819, + "step": 772 + }, + { + "epoch": 1.3778966131907309, + "grad_norm": 1.4071670770645142, + "learning_rate": 1.8136769078295344e-06, + "loss": 0.8472, + "step": 773 + }, + { + "epoch": 1.3796791443850267, + "grad_norm": 1.388598084449768, + "learning_rate": 1.8087215064420218e-06, + "loss": 0.8377, + "step": 774 + }, + { + "epoch": 1.3814616755793225, + "grad_norm": 1.3947046995162964, + "learning_rate": 1.8037661050545097e-06, + "loss": 0.8687, + "step": 775 + }, + { + "epoch": 1.3832442067736186, + "grad_norm": 1.363625407218933, + "learning_rate": 1.7988107036669972e-06, + "loss": 0.8834, + "step": 776 + }, + { + "epoch": 1.3850267379679144, + "grad_norm": 1.3984752893447876, + "learning_rate": 1.7938553022794846e-06, + "loss": 0.889, + "step": 777 + }, + { + "epoch": 1.3868092691622103, + "grad_norm": 1.3725652694702148, + "learning_rate": 1.7888999008919725e-06, + "loss": 0.876, + "step": 778 + }, + { + "epoch": 1.3885918003565063, + "grad_norm": 1.3153650760650635, + "learning_rate": 1.78394449950446e-06, + "loss": 0.8499, + "step": 779 + }, + { + "epoch": 1.3903743315508021, + "grad_norm": 1.5021432638168335, + "learning_rate": 1.7789890981169478e-06, + "loss": 0.8692, + "step": 780 + }, + { + "epoch": 1.3903743315508021, + "eval_loss": 0.9833415746688843, + "eval_runtime": 47.0835, + "eval_samples_per_second": 21.239, + "eval_steps_per_second": 1.338, + "step": 780 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 1.4319101572036743, + "learning_rate": 1.7740336967294353e-06, + "loss": 0.8789, + "step": 781 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 1.339253306388855, + "learning_rate": 1.7690782953419227e-06, + "loss": 0.8383, + "step": 782 + }, + { + "epoch": 1.3957219251336899, + "grad_norm": 1.3650999069213867, + "learning_rate": 1.7641228939544106e-06, + "loss": 0.8887, + "step": 783 + }, + { + "epoch": 1.3975044563279857, + 
"grad_norm": 1.4936254024505615, + "learning_rate": 1.759167492566898e-06, + "loss": 0.8612, + "step": 784 + }, + { + "epoch": 1.3992869875222818, + "grad_norm": 1.5137629508972168, + "learning_rate": 1.7542120911793855e-06, + "loss": 0.8589, + "step": 785 + }, + { + "epoch": 1.4010695187165776, + "grad_norm": 1.533828854560852, + "learning_rate": 1.7492566897918734e-06, + "loss": 0.9156, + "step": 786 + }, + { + "epoch": 1.4028520499108734, + "grad_norm": 1.531721830368042, + "learning_rate": 1.7443012884043608e-06, + "loss": 0.8586, + "step": 787 + }, + { + "epoch": 1.4046345811051695, + "grad_norm": 1.3086016178131104, + "learning_rate": 1.7393458870168487e-06, + "loss": 0.8745, + "step": 788 + }, + { + "epoch": 1.4064171122994653, + "grad_norm": 1.552501916885376, + "learning_rate": 1.7343904856293361e-06, + "loss": 0.8973, + "step": 789 + }, + { + "epoch": 1.4081996434937611, + "grad_norm": 1.5181113481521606, + "learning_rate": 1.7294350842418236e-06, + "loss": 0.8966, + "step": 790 + }, + { + "epoch": 1.4081996434937611, + "eval_loss": 0.9810673594474792, + "eval_runtime": 46.8684, + "eval_samples_per_second": 21.336, + "eval_steps_per_second": 1.344, + "step": 790 + }, + { + "epoch": 1.409982174688057, + "grad_norm": 1.4277552366256714, + "learning_rate": 1.7244796828543115e-06, + "loss": 0.8757, + "step": 791 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 1.5821561813354492, + "learning_rate": 1.719524281466799e-06, + "loss": 0.8413, + "step": 792 + }, + { + "epoch": 1.4135472370766489, + "grad_norm": 1.62177312374115, + "learning_rate": 1.7145688800792864e-06, + "loss": 0.8655, + "step": 793 + }, + { + "epoch": 1.4153297682709447, + "grad_norm": 1.4088196754455566, + "learning_rate": 1.7096134786917742e-06, + "loss": 0.8847, + "step": 794 + }, + { + "epoch": 1.4171122994652405, + "grad_norm": 1.3866719007492065, + "learning_rate": 1.7046580773042617e-06, + "loss": 0.8587, + "step": 795 + }, + { + "epoch": 1.4188948306595366, + "grad_norm": 1.3425517082214355, + "learning_rate": 1.6997026759167496e-06, + "loss": 0.8458, + "step": 796 + }, + { + "epoch": 1.4206773618538324, + "grad_norm": 1.6923450231552124, + "learning_rate": 1.694747274529237e-06, + "loss": 0.8479, + "step": 797 + }, + { + "epoch": 1.4224598930481283, + "grad_norm": 1.6441086530685425, + "learning_rate": 1.6897918731417245e-06, + "loss": 0.8601, + "step": 798 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 1.314751148223877, + "learning_rate": 1.6848364717542123e-06, + "loss": 0.8763, + "step": 799 + }, + { + "epoch": 1.4260249554367201, + "grad_norm": 1.3205382823944092, + "learning_rate": 1.6798810703666998e-06, + "loss": 0.8743, + "step": 800 + }, + { + "epoch": 1.4260249554367201, + "eval_loss": 0.9791409969329834, + "eval_runtime": 46.9219, + "eval_samples_per_second": 21.312, + "eval_steps_per_second": 1.343, + "step": 800 + }, + { + "epoch": 1.427807486631016, + "grad_norm": 1.3294618129730225, + "learning_rate": 1.6749256689791874e-06, + "loss": 0.9012, + "step": 801 + }, + { + "epoch": 1.429590017825312, + "grad_norm": 1.5094256401062012, + "learning_rate": 1.669970267591675e-06, + "loss": 0.8675, + "step": 802 + }, + { + "epoch": 1.4313725490196079, + "grad_norm": 1.41444993019104, + "learning_rate": 1.6650148662041625e-06, + "loss": 0.8706, + "step": 803 + }, + { + "epoch": 1.4331550802139037, + "grad_norm": 1.65440034866333, + "learning_rate": 1.6600594648166502e-06, + "loss": 0.8435, + "step": 804 + }, + { + "epoch": 1.4349376114081998, + "grad_norm": 1.6403052806854248, + 
"learning_rate": 1.6551040634291379e-06, + "loss": 0.874, + "step": 805 + }, + { + "epoch": 1.4367201426024956, + "grad_norm": 1.4437483549118042, + "learning_rate": 1.6501486620416255e-06, + "loss": 0.9185, + "step": 806 + }, + { + "epoch": 1.4385026737967914, + "grad_norm": 1.4905844926834106, + "learning_rate": 1.6451932606541132e-06, + "loss": 0.8883, + "step": 807 + }, + { + "epoch": 1.4402852049910875, + "grad_norm": 1.3557101488113403, + "learning_rate": 1.6402378592666006e-06, + "loss": 0.877, + "step": 808 + }, + { + "epoch": 1.4420677361853833, + "grad_norm": 1.3058723211288452, + "learning_rate": 1.6352824578790883e-06, + "loss": 0.8434, + "step": 809 + }, + { + "epoch": 1.4438502673796791, + "grad_norm": 1.2745517492294312, + "learning_rate": 1.630327056491576e-06, + "loss": 0.8342, + "step": 810 + }, + { + "epoch": 1.4438502673796791, + "eval_loss": 0.976764976978302, + "eval_runtime": 47.017, + "eval_samples_per_second": 21.269, + "eval_steps_per_second": 1.34, + "step": 810 + }, + { + "epoch": 1.445632798573975, + "grad_norm": 1.3594675064086914, + "learning_rate": 1.6253716551040636e-06, + "loss": 0.8595, + "step": 811 + }, + { + "epoch": 1.4474153297682708, + "grad_norm": 1.369518756866455, + "learning_rate": 1.620416253716551e-06, + "loss": 0.8829, + "step": 812 + }, + { + "epoch": 1.4491978609625669, + "grad_norm": 1.3764517307281494, + "learning_rate": 1.6154608523290387e-06, + "loss": 0.8823, + "step": 813 + }, + { + "epoch": 1.4509803921568627, + "grad_norm": 1.540014624595642, + "learning_rate": 1.6105054509415264e-06, + "loss": 0.8946, + "step": 814 + }, + { + "epoch": 1.4527629233511585, + "grad_norm": 1.6402170658111572, + "learning_rate": 1.605550049554014e-06, + "loss": 0.8679, + "step": 815 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.4860880374908447, + "learning_rate": 1.6005946481665015e-06, + "loss": 0.8545, + "step": 816 + }, + { + "epoch": 1.4563279857397504, + "grad_norm": 1.3628255128860474, + "learning_rate": 1.5956392467789892e-06, + "loss": 0.8528, + "step": 817 + }, + { + "epoch": 1.4581105169340463, + "grad_norm": 1.293563723564148, + "learning_rate": 1.5906838453914768e-06, + "loss": 0.8464, + "step": 818 + }, + { + "epoch": 1.4598930481283423, + "grad_norm": 1.3827394247055054, + "learning_rate": 1.5857284440039645e-06, + "loss": 0.8442, + "step": 819 + }, + { + "epoch": 1.4616755793226381, + "grad_norm": 1.3776166439056396, + "learning_rate": 1.580773042616452e-06, + "loss": 0.864, + "step": 820 + }, + { + "epoch": 1.4616755793226381, + "eval_loss": 0.9785693883895874, + "eval_runtime": 46.9106, + "eval_samples_per_second": 21.317, + "eval_steps_per_second": 1.343, + "step": 820 + }, + { + "epoch": 1.463458110516934, + "grad_norm": 1.3489983081817627, + "learning_rate": 1.5758176412289396e-06, + "loss": 0.8603, + "step": 821 + }, + { + "epoch": 1.46524064171123, + "grad_norm": 1.3305773735046387, + "learning_rate": 1.5708622398414273e-06, + "loss": 0.8598, + "step": 822 + }, + { + "epoch": 1.4670231729055259, + "grad_norm": 1.4561748504638672, + "learning_rate": 1.565906838453915e-06, + "loss": 0.884, + "step": 823 + }, + { + "epoch": 1.4688057040998217, + "grad_norm": 1.3868783712387085, + "learning_rate": 1.5609514370664026e-06, + "loss": 0.894, + "step": 824 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.3360710144042969, + "learning_rate": 1.55599603567889e-06, + "loss": 0.8581, + "step": 825 + }, + { + "epoch": 1.4723707664884136, + "grad_norm": 1.3062288761138916, + "learning_rate": 1.5510406342913777e-06, + "loss": 
0.8703, + "step": 826 + }, + { + "epoch": 1.4741532976827094, + "grad_norm": 1.3395740985870361, + "learning_rate": 1.5460852329038654e-06, + "loss": 0.8906, + "step": 827 + }, + { + "epoch": 1.4759358288770055, + "grad_norm": 1.5396114587783813, + "learning_rate": 1.5411298315163528e-06, + "loss": 0.8845, + "step": 828 + }, + { + "epoch": 1.4777183600713013, + "grad_norm": 1.420955777168274, + "learning_rate": 1.5361744301288407e-06, + "loss": 0.8841, + "step": 829 + }, + { + "epoch": 1.4795008912655971, + "grad_norm": 1.4807546138763428, + "learning_rate": 1.5312190287413281e-06, + "loss": 0.8091, + "step": 830 + }, + { + "epoch": 1.4795008912655971, + "eval_loss": 0.9777600169181824, + "eval_runtime": 46.9306, + "eval_samples_per_second": 21.308, + "eval_steps_per_second": 1.342, + "step": 830 + }, + { + "epoch": 1.481283422459893, + "grad_norm": 1.5020588636398315, + "learning_rate": 1.5262636273538158e-06, + "loss": 0.8964, + "step": 831 + }, + { + "epoch": 1.483065953654189, + "grad_norm": 1.3068898916244507, + "learning_rate": 1.5213082259663035e-06, + "loss": 0.8868, + "step": 832 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 1.448590636253357, + "learning_rate": 1.516352824578791e-06, + "loss": 0.8554, + "step": 833 + }, + { + "epoch": 1.4866310160427807, + "grad_norm": 1.4764806032180786, + "learning_rate": 1.5113974231912786e-06, + "loss": 0.8683, + "step": 834 + }, + { + "epoch": 1.4884135472370765, + "grad_norm": 1.5730153322219849, + "learning_rate": 1.5064420218037662e-06, + "loss": 0.8511, + "step": 835 + }, + { + "epoch": 1.4901960784313726, + "grad_norm": 1.454858422279358, + "learning_rate": 1.5014866204162537e-06, + "loss": 0.8801, + "step": 836 + }, + { + "epoch": 1.4919786096256684, + "grad_norm": 1.2339946031570435, + "learning_rate": 1.4965312190287416e-06, + "loss": 0.8551, + "step": 837 + }, + { + "epoch": 1.4937611408199643, + "grad_norm": 1.4510794878005981, + "learning_rate": 1.491575817641229e-06, + "loss": 0.8639, + "step": 838 + }, + { + "epoch": 1.4955436720142603, + "grad_norm": 1.4182809591293335, + "learning_rate": 1.4866204162537167e-06, + "loss": 0.8486, + "step": 839 + }, + { + "epoch": 1.4973262032085561, + "grad_norm": 1.485177993774414, + "learning_rate": 1.4816650148662043e-06, + "loss": 0.8787, + "step": 840 + }, + { + "epoch": 1.4973262032085561, + "eval_loss": 0.9784544706344604, + "eval_runtime": 47.0602, + "eval_samples_per_second": 21.249, + "eval_steps_per_second": 1.339, + "step": 840 + }, + { + "epoch": 1.499108734402852, + "grad_norm": 1.579306721687317, + "learning_rate": 1.4767096134786918e-06, + "loss": 0.891, + "step": 841 + }, + { + "epoch": 1.500891265597148, + "grad_norm": 1.5453161001205444, + "learning_rate": 1.4717542120911797e-06, + "loss": 0.8892, + "step": 842 + }, + { + "epoch": 1.5026737967914439, + "grad_norm": 1.4892089366912842, + "learning_rate": 1.4667988107036671e-06, + "loss": 0.8884, + "step": 843 + }, + { + "epoch": 1.5044563279857397, + "grad_norm": 1.328150987625122, + "learning_rate": 1.4618434093161546e-06, + "loss": 0.8631, + "step": 844 + }, + { + "epoch": 1.5062388591800357, + "grad_norm": 1.3877817392349243, + "learning_rate": 1.4568880079286424e-06, + "loss": 0.8622, + "step": 845 + }, + { + "epoch": 1.5080213903743316, + "grad_norm": 1.4985857009887695, + "learning_rate": 1.4519326065411299e-06, + "loss": 0.8857, + "step": 846 + }, + { + "epoch": 1.5098039215686274, + "grad_norm": 1.3666197061538696, + "learning_rate": 1.4469772051536178e-06, + "loss": 0.823, + "step": 847 + }, + { + "epoch": 
1.5115864527629235, + "grad_norm": 1.5058780908584595, + "learning_rate": 1.4420218037661052e-06, + "loss": 0.893, + "step": 848 + }, + { + "epoch": 1.5133689839572193, + "grad_norm": 1.4480105638504028, + "learning_rate": 1.4370664023785927e-06, + "loss": 0.869, + "step": 849 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 1.503057599067688, + "learning_rate": 1.4321110009910805e-06, + "loss": 0.8804, + "step": 850 + }, + { + "epoch": 1.5151515151515151, + "eval_loss": 0.9779770374298096, + "eval_runtime": 47.0961, + "eval_samples_per_second": 21.233, + "eval_steps_per_second": 1.338, + "step": 850 + }, + { + "epoch": 1.5169340463458112, + "grad_norm": 1.4159053564071655, + "learning_rate": 1.427155599603568e-06, + "loss": 0.8564, + "step": 851 + }, + { + "epoch": 1.5187165775401068, + "grad_norm": 1.3249149322509766, + "learning_rate": 1.4222001982160554e-06, + "loss": 0.8598, + "step": 852 + }, + { + "epoch": 1.5204991087344029, + "grad_norm": 1.7462847232818604, + "learning_rate": 1.4172447968285433e-06, + "loss": 0.8788, + "step": 853 + }, + { + "epoch": 1.522281639928699, + "grad_norm": 1.4859366416931152, + "learning_rate": 1.4122893954410308e-06, + "loss": 0.8624, + "step": 854 + }, + { + "epoch": 1.5240641711229945, + "grad_norm": 1.4355590343475342, + "learning_rate": 1.4073339940535186e-06, + "loss": 0.8639, + "step": 855 + }, + { + "epoch": 1.5258467023172906, + "grad_norm": 1.3695921897888184, + "learning_rate": 1.402378592666006e-06, + "loss": 0.9011, + "step": 856 + }, + { + "epoch": 1.5276292335115864, + "grad_norm": 1.4948954582214355, + "learning_rate": 1.3974231912784935e-06, + "loss": 0.8689, + "step": 857 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 1.4612298011779785, + "learning_rate": 1.3924677898909814e-06, + "loss": 0.8753, + "step": 858 + }, + { + "epoch": 1.5311942959001783, + "grad_norm": 1.3478108644485474, + "learning_rate": 1.3875123885034689e-06, + "loss": 0.8684, + "step": 859 + }, + { + "epoch": 1.5329768270944741, + "grad_norm": 1.349802017211914, + "learning_rate": 1.3825569871159563e-06, + "loss": 0.8532, + "step": 860 + }, + { + "epoch": 1.5329768270944741, + "eval_loss": 0.9780614972114563, + "eval_runtime": 46.8154, + "eval_samples_per_second": 21.36, + "eval_steps_per_second": 1.346, + "step": 860 + }, + { + "epoch": 1.53475935828877, + "grad_norm": 1.3854540586471558, + "learning_rate": 1.3776015857284442e-06, + "loss": 0.8555, + "step": 861 + }, + { + "epoch": 1.536541889483066, + "grad_norm": 1.5170377492904663, + "learning_rate": 1.3726461843409316e-06, + "loss": 0.868, + "step": 862 + }, + { + "epoch": 1.5383244206773619, + "grad_norm": 1.2990375757217407, + "learning_rate": 1.3676907829534195e-06, + "loss": 0.8884, + "step": 863 + }, + { + "epoch": 1.5401069518716577, + "grad_norm": 1.7202647924423218, + "learning_rate": 1.362735381565907e-06, + "loss": 0.8877, + "step": 864 + }, + { + "epoch": 1.5418894830659537, + "grad_norm": 1.5239756107330322, + "learning_rate": 1.3577799801783944e-06, + "loss": 0.8492, + "step": 865 + }, + { + "epoch": 1.5436720142602496, + "grad_norm": 1.6998838186264038, + "learning_rate": 1.3528245787908823e-06, + "loss": 0.8541, + "step": 866 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.5735970735549927, + "learning_rate": 1.3478691774033697e-06, + "loss": 0.839, + "step": 867 + }, + { + "epoch": 1.5472370766488415, + "grad_norm": 1.411190152168274, + "learning_rate": 1.3429137760158572e-06, + "loss": 0.8485, + "step": 868 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 
1.404719591140747, + "learning_rate": 1.337958374628345e-06, + "loss": 0.8878, + "step": 869 + }, + { + "epoch": 1.5508021390374331, + "grad_norm": 1.3522731065750122, + "learning_rate": 1.3330029732408325e-06, + "loss": 0.8515, + "step": 870 + }, + { + "epoch": 1.5508021390374331, + "eval_loss": 0.9770654439926147, + "eval_runtime": 46.8266, + "eval_samples_per_second": 21.355, + "eval_steps_per_second": 1.345, + "step": 870 + }, + { + "epoch": 1.5525846702317292, + "grad_norm": 1.5110260248184204, + "learning_rate": 1.3280475718533204e-06, + "loss": 0.8872, + "step": 871 + }, + { + "epoch": 1.5543672014260248, + "grad_norm": 1.3744357824325562, + "learning_rate": 1.3230921704658078e-06, + "loss": 0.846, + "step": 872 + }, + { + "epoch": 1.5561497326203209, + "grad_norm": 1.3850080966949463, + "learning_rate": 1.3181367690782953e-06, + "loss": 0.8583, + "step": 873 + }, + { + "epoch": 1.557932263814617, + "grad_norm": 1.3675907850265503, + "learning_rate": 1.3131813676907832e-06, + "loss": 0.8449, + "step": 874 + }, + { + "epoch": 1.5597147950089125, + "grad_norm": 1.5432913303375244, + "learning_rate": 1.3082259663032706e-06, + "loss": 0.8614, + "step": 875 + }, + { + "epoch": 1.5614973262032086, + "grad_norm": 1.450357437133789, + "learning_rate": 1.303270564915758e-06, + "loss": 0.879, + "step": 876 + }, + { + "epoch": 1.5632798573975044, + "grad_norm": 1.320804238319397, + "learning_rate": 1.298315163528246e-06, + "loss": 0.8628, + "step": 877 + }, + { + "epoch": 1.5650623885918002, + "grad_norm": 1.6349151134490967, + "learning_rate": 1.2933597621407334e-06, + "loss": 0.8911, + "step": 878 + }, + { + "epoch": 1.5668449197860963, + "grad_norm": 1.7157044410705566, + "learning_rate": 1.2884043607532213e-06, + "loss": 0.8828, + "step": 879 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 1.4008780717849731, + "learning_rate": 1.2834489593657087e-06, + "loss": 0.8749, + "step": 880 + }, + { + "epoch": 1.5686274509803921, + "eval_loss": 0.9742602705955505, + "eval_runtime": 46.9284, + "eval_samples_per_second": 21.309, + "eval_steps_per_second": 1.342, + "step": 880 + }, + { + "epoch": 1.570409982174688, + "grad_norm": 1.318920373916626, + "learning_rate": 1.2784935579781962e-06, + "loss": 0.8446, + "step": 881 + }, + { + "epoch": 1.572192513368984, + "grad_norm": 1.393837332725525, + "learning_rate": 1.273538156590684e-06, + "loss": 0.8629, + "step": 882 + }, + { + "epoch": 1.5739750445632799, + "grad_norm": 1.3566768169403076, + "learning_rate": 1.2685827552031715e-06, + "loss": 0.8621, + "step": 883 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 1.699874758720398, + "learning_rate": 1.2636273538156591e-06, + "loss": 0.8613, + "step": 884 + }, + { + "epoch": 1.5775401069518717, + "grad_norm": 1.401503086090088, + "learning_rate": 1.2586719524281468e-06, + "loss": 0.8455, + "step": 885 + }, + { + "epoch": 1.5793226381461676, + "grad_norm": 1.5046734809875488, + "learning_rate": 1.2537165510406342e-06, + "loss": 0.86, + "step": 886 + }, + { + "epoch": 1.5811051693404634, + "grad_norm": 1.4957525730133057, + "learning_rate": 1.248761149653122e-06, + "loss": 0.8512, + "step": 887 + }, + { + "epoch": 1.5828877005347595, + "grad_norm": 1.4307347536087036, + "learning_rate": 1.2438057482656096e-06, + "loss": 0.8533, + "step": 888 + }, + { + "epoch": 1.5846702317290553, + "grad_norm": 1.4483654499053955, + "learning_rate": 1.2388503468780972e-06, + "loss": 0.8619, + "step": 889 + }, + { + "epoch": 1.5864527629233511, + "grad_norm": 1.3696205615997314, + "learning_rate": 
1.2338949454905847e-06, + "loss": 0.8602, + "step": 890 + }, + { + "epoch": 1.5864527629233511, + "eval_loss": 0.9740327596664429, + "eval_runtime": 46.7849, + "eval_samples_per_second": 21.374, + "eval_steps_per_second": 1.347, + "step": 890 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 1.479094386100769, + "learning_rate": 1.2289395441030723e-06, + "loss": 0.8781, + "step": 891 + }, + { + "epoch": 1.5900178253119428, + "grad_norm": 1.5047471523284912, + "learning_rate": 1.22398414271556e-06, + "loss": 0.8956, + "step": 892 + }, + { + "epoch": 1.5918003565062389, + "grad_norm": 1.5792458057403564, + "learning_rate": 1.2190287413280477e-06, + "loss": 0.843, + "step": 893 + }, + { + "epoch": 1.593582887700535, + "grad_norm": 1.5192897319793701, + "learning_rate": 1.2140733399405351e-06, + "loss": 0.9011, + "step": 894 + }, + { + "epoch": 1.5953654188948305, + "grad_norm": 1.5448248386383057, + "learning_rate": 1.2091179385530228e-06, + "loss": 0.8518, + "step": 895 + }, + { + "epoch": 1.5971479500891266, + "grad_norm": 1.4314959049224854, + "learning_rate": 1.2041625371655104e-06, + "loss": 0.8539, + "step": 896 + }, + { + "epoch": 1.5989304812834224, + "grad_norm": 1.321249008178711, + "learning_rate": 1.1992071357779981e-06, + "loss": 0.8342, + "step": 897 + }, + { + "epoch": 1.6007130124777182, + "grad_norm": 1.3444586992263794, + "learning_rate": 1.1942517343904858e-06, + "loss": 0.8595, + "step": 898 + }, + { + "epoch": 1.6024955436720143, + "grad_norm": 1.4096806049346924, + "learning_rate": 1.1892963330029732e-06, + "loss": 0.8736, + "step": 899 + }, + { + "epoch": 1.6042780748663101, + "grad_norm": 1.3600504398345947, + "learning_rate": 1.1843409316154609e-06, + "loss": 0.865, + "step": 900 + }, + { + "epoch": 1.6042780748663101, + "eval_loss": 0.9747118949890137, + "eval_runtime": 46.7507, + "eval_samples_per_second": 21.39, + "eval_steps_per_second": 1.348, + "step": 900 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 1.4022847414016724, + "learning_rate": 1.1793855302279485e-06, + "loss": 0.8613, + "step": 901 + }, + { + "epoch": 1.607843137254902, + "grad_norm": 1.4285733699798584, + "learning_rate": 1.1744301288404362e-06, + "loss": 0.8422, + "step": 902 + }, + { + "epoch": 1.6096256684491979, + "grad_norm": 1.4409127235412598, + "learning_rate": 1.1694747274529237e-06, + "loss": 0.8812, + "step": 903 + }, + { + "epoch": 1.6114081996434937, + "grad_norm": 1.4568157196044922, + "learning_rate": 1.1645193260654113e-06, + "loss": 0.8614, + "step": 904 + }, + { + "epoch": 1.6131907308377897, + "grad_norm": 1.3659733533859253, + "learning_rate": 1.159563924677899e-06, + "loss": 0.8476, + "step": 905 + }, + { + "epoch": 1.6149732620320856, + "grad_norm": 1.2912815809249878, + "learning_rate": 1.1546085232903866e-06, + "loss": 0.8716, + "step": 906 + }, + { + "epoch": 1.6167557932263814, + "grad_norm": 1.267293095588684, + "learning_rate": 1.1496531219028743e-06, + "loss": 0.844, + "step": 907 + }, + { + "epoch": 1.6185383244206775, + "grad_norm": 1.3510090112686157, + "learning_rate": 1.1446977205153618e-06, + "loss": 0.8698, + "step": 908 + }, + { + "epoch": 1.6203208556149733, + "grad_norm": 1.3630146980285645, + "learning_rate": 1.1397423191278494e-06, + "loss": 0.8503, + "step": 909 + }, + { + "epoch": 1.6221033868092691, + "grad_norm": 1.4114625453948975, + "learning_rate": 1.134786917740337e-06, + "loss": 0.8918, + "step": 910 + }, + { + "epoch": 1.6221033868092691, + "eval_loss": 0.9753348231315613, + "eval_runtime": 46.5853, + "eval_samples_per_second": 21.466, 
+ "eval_steps_per_second": 1.352, + "step": 910 + }, + { + "epoch": 1.6238859180035652, + "grad_norm": 1.4507378339767456, + "learning_rate": 1.1298315163528247e-06, + "loss": 0.8544, + "step": 911 + }, + { + "epoch": 1.6256684491978608, + "grad_norm": 1.4346193075180054, + "learning_rate": 1.1248761149653122e-06, + "loss": 0.8598, + "step": 912 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 1.3302711248397827, + "learning_rate": 1.1199207135777999e-06, + "loss": 0.8577, + "step": 913 + }, + { + "epoch": 1.629233511586453, + "grad_norm": 1.3755695819854736, + "learning_rate": 1.1149653121902875e-06, + "loss": 0.8837, + "step": 914 + }, + { + "epoch": 1.6310160427807485, + "grad_norm": 1.3624552488327026, + "learning_rate": 1.1100099108027752e-06, + "loss": 0.842, + "step": 915 + }, + { + "epoch": 1.6327985739750446, + "grad_norm": 1.3074328899383545, + "learning_rate": 1.1050545094152628e-06, + "loss": 0.8627, + "step": 916 + }, + { + "epoch": 1.6345811051693404, + "grad_norm": 1.367962121963501, + "learning_rate": 1.1000991080277503e-06, + "loss": 0.8497, + "step": 917 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.4831198453903198, + "learning_rate": 1.095143706640238e-06, + "loss": 0.8663, + "step": 918 + }, + { + "epoch": 1.6381461675579323, + "grad_norm": 1.3029274940490723, + "learning_rate": 1.0901883052527256e-06, + "loss": 0.8619, + "step": 919 + }, + { + "epoch": 1.6399286987522281, + "grad_norm": 1.30776047706604, + "learning_rate": 1.0852329038652133e-06, + "loss": 0.8964, + "step": 920 + }, + { + "epoch": 1.6399286987522281, + "eval_loss": 0.9733108878135681, + "eval_runtime": 46.6339, + "eval_samples_per_second": 21.444, + "eval_steps_per_second": 1.351, + "step": 920 + }, + { + "epoch": 1.641711229946524, + "grad_norm": 1.4224649667739868, + "learning_rate": 1.080277502477701e-06, + "loss": 0.8656, + "step": 921 + }, + { + "epoch": 1.64349376114082, + "grad_norm": 1.3711802959442139, + "learning_rate": 1.0753221010901884e-06, + "loss": 0.8649, + "step": 922 + }, + { + "epoch": 1.6452762923351159, + "grad_norm": 1.336634635925293, + "learning_rate": 1.070366699702676e-06, + "loss": 0.846, + "step": 923 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.5247148275375366, + "learning_rate": 1.0654112983151637e-06, + "loss": 0.885, + "step": 924 + }, + { + "epoch": 1.6488413547237077, + "grad_norm": 1.6466078758239746, + "learning_rate": 1.0604558969276514e-06, + "loss": 0.8769, + "step": 925 + }, + { + "epoch": 1.6506238859180036, + "grad_norm": 1.4711053371429443, + "learning_rate": 1.0555004955401388e-06, + "loss": 0.8402, + "step": 926 + }, + { + "epoch": 1.6524064171122994, + "grad_norm": 1.3819425106048584, + "learning_rate": 1.0505450941526265e-06, + "loss": 0.8406, + "step": 927 + }, + { + "epoch": 1.6541889483065955, + "grad_norm": 1.3762542009353638, + "learning_rate": 1.0455896927651141e-06, + "loss": 0.8703, + "step": 928 + }, + { + "epoch": 1.6559714795008913, + "grad_norm": 1.4304357767105103, + "learning_rate": 1.0406342913776018e-06, + "loss": 0.8701, + "step": 929 + }, + { + "epoch": 1.6577540106951871, + "grad_norm": 1.425401210784912, + "learning_rate": 1.0356788899900893e-06, + "loss": 0.8959, + "step": 930 + }, + { + "epoch": 1.6577540106951871, + "eval_loss": 0.9722786545753479, + "eval_runtime": 46.6071, + "eval_samples_per_second": 21.456, + "eval_steps_per_second": 1.352, + "step": 930 + }, + { + "epoch": 1.6595365418894832, + "grad_norm": 1.4085595607757568, + "learning_rate": 1.030723488602577e-06, + "loss": 0.8659, + "step": 
931 + }, + { + "epoch": 1.661319073083779, + "grad_norm": 1.3777707815170288, + "learning_rate": 1.0257680872150646e-06, + "loss": 0.8653, + "step": 932 + }, + { + "epoch": 1.6631016042780749, + "grad_norm": 1.3614346981048584, + "learning_rate": 1.0208126858275522e-06, + "loss": 0.8484, + "step": 933 + }, + { + "epoch": 1.664884135472371, + "grad_norm": 1.5707229375839233, + "learning_rate": 1.0158572844400397e-06, + "loss": 0.8472, + "step": 934 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.4438365697860718, + "learning_rate": 1.0109018830525274e-06, + "loss": 0.8725, + "step": 935 + }, + { + "epoch": 1.6684491978609626, + "grad_norm": 1.4993953704833984, + "learning_rate": 1.005946481665015e-06, + "loss": 0.8674, + "step": 936 + }, + { + "epoch": 1.6702317290552586, + "grad_norm": 1.4290693998336792, + "learning_rate": 1.0009910802775027e-06, + "loss": 0.9074, + "step": 937 + }, + { + "epoch": 1.6720142602495542, + "grad_norm": 1.3130066394805908, + "learning_rate": 9.960356788899901e-07, + "loss": 0.8474, + "step": 938 + }, + { + "epoch": 1.6737967914438503, + "grad_norm": 1.410224199295044, + "learning_rate": 9.910802775024778e-07, + "loss": 0.8933, + "step": 939 + }, + { + "epoch": 1.6755793226381461, + "grad_norm": 1.414833426475525, + "learning_rate": 9.861248761149655e-07, + "loss": 0.8384, + "step": 940 + }, + { + "epoch": 1.6755793226381461, + "eval_loss": 0.972200870513916, + "eval_runtime": 46.6515, + "eval_samples_per_second": 21.436, + "eval_steps_per_second": 1.35, + "step": 940 + }, + { + "epoch": 1.677361853832442, + "grad_norm": 1.4031621217727661, + "learning_rate": 9.811694747274531e-07, + "loss": 0.8616, + "step": 941 + }, + { + "epoch": 1.679144385026738, + "grad_norm": 1.359629511833191, + "learning_rate": 9.762140733399406e-07, + "loss": 0.8736, + "step": 942 + }, + { + "epoch": 1.6809269162210339, + "grad_norm": 1.4111278057098389, + "learning_rate": 9.712586719524282e-07, + "loss": 0.8786, + "step": 943 + }, + { + "epoch": 1.6827094474153297, + "grad_norm": 1.2958585023880005, + "learning_rate": 9.663032705649159e-07, + "loss": 0.8778, + "step": 944 + }, + { + "epoch": 1.6844919786096257, + "grad_norm": 1.2840744256973267, + "learning_rate": 9.613478691774035e-07, + "loss": 0.8403, + "step": 945 + }, + { + "epoch": 1.6862745098039216, + "grad_norm": 1.3533388376235962, + "learning_rate": 9.56392467789891e-07, + "loss": 0.8607, + "step": 946 + }, + { + "epoch": 1.6880570409982174, + "grad_norm": 1.4401700496673584, + "learning_rate": 9.514370664023787e-07, + "loss": 0.8829, + "step": 947 + }, + { + "epoch": 1.6898395721925135, + "grad_norm": 1.36722731590271, + "learning_rate": 9.464816650148663e-07, + "loss": 0.8987, + "step": 948 + }, + { + "epoch": 1.6916221033868093, + "grad_norm": 1.4618425369262695, + "learning_rate": 9.41526263627354e-07, + "loss": 0.8751, + "step": 949 + }, + { + "epoch": 1.6934046345811051, + "grad_norm": 1.316740870475769, + "learning_rate": 9.365708622398414e-07, + "loss": 0.8744, + "step": 950 + }, + { + "epoch": 1.6934046345811051, + "eval_loss": 0.9702951312065125, + "eval_runtime": 46.6377, + "eval_samples_per_second": 21.442, + "eval_steps_per_second": 1.351, + "step": 950 + }, + { + "epoch": 1.6951871657754012, + "grad_norm": 1.2979589700698853, + "learning_rate": 9.316154608523291e-07, + "loss": 0.8648, + "step": 951 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 1.327683448791504, + "learning_rate": 9.266600594648168e-07, + "loss": 0.8673, + "step": 952 + }, + { + "epoch": 1.6987522281639929, + "grad_norm": 
1.37446928024292, + "learning_rate": 9.217046580773044e-07, + "loss": 0.8694, + "step": 953 + }, + { + "epoch": 1.700534759358289, + "grad_norm": 1.4593381881713867, + "learning_rate": 9.167492566897919e-07, + "loss": 0.8567, + "step": 954 + }, + { + "epoch": 1.7023172905525845, + "grad_norm": 1.3869240283966064, + "learning_rate": 9.117938553022795e-07, + "loss": 0.8561, + "step": 955 + }, + { + "epoch": 1.7040998217468806, + "grad_norm": 1.4869533777236938, + "learning_rate": 9.068384539147672e-07, + "loss": 0.8749, + "step": 956 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.421717882156372, + "learning_rate": 9.018830525272549e-07, + "loss": 0.8468, + "step": 957 + }, + { + "epoch": 1.7076648841354722, + "grad_norm": 1.276408314704895, + "learning_rate": 8.969276511397423e-07, + "loss": 0.8563, + "step": 958 + }, + { + "epoch": 1.7094474153297683, + "grad_norm": 1.4118410348892212, + "learning_rate": 8.9197224975223e-07, + "loss": 0.8841, + "step": 959 + }, + { + "epoch": 1.7112299465240641, + "grad_norm": 1.3833427429199219, + "learning_rate": 8.870168483647176e-07, + "loss": 0.8331, + "step": 960 + }, + { + "epoch": 1.7112299465240641, + "eval_loss": 0.9688822627067566, + "eval_runtime": 46.5438, + "eval_samples_per_second": 21.485, + "eval_steps_per_second": 1.354, + "step": 960 + }, + { + "epoch": 1.71301247771836, + "grad_norm": 1.2577204704284668, + "learning_rate": 8.820614469772053e-07, + "loss": 0.843, + "step": 961 + }, + { + "epoch": 1.714795008912656, + "grad_norm": 1.4835284948349, + "learning_rate": 8.771060455896927e-07, + "loss": 0.8575, + "step": 962 + }, + { + "epoch": 1.7165775401069518, + "grad_norm": 1.2658771276474, + "learning_rate": 8.721506442021804e-07, + "loss": 0.8559, + "step": 963 + }, + { + "epoch": 1.7183600713012477, + "grad_norm": 1.3600130081176758, + "learning_rate": 8.671952428146681e-07, + "loss": 0.8782, + "step": 964 + }, + { + "epoch": 1.7201426024955437, + "grad_norm": 1.4301601648330688, + "learning_rate": 8.622398414271557e-07, + "loss": 0.8856, + "step": 965 + }, + { + "epoch": 1.7219251336898396, + "grad_norm": 1.3931207656860352, + "learning_rate": 8.572844400396432e-07, + "loss": 0.8804, + "step": 966 + }, + { + "epoch": 1.7237076648841354, + "grad_norm": 1.3883016109466553, + "learning_rate": 8.523290386521308e-07, + "loss": 0.855, + "step": 967 + }, + { + "epoch": 1.7254901960784315, + "grad_norm": 1.6030256748199463, + "learning_rate": 8.473736372646185e-07, + "loss": 0.8384, + "step": 968 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.3320417404174805, + "learning_rate": 8.424182358771062e-07, + "loss": 0.8611, + "step": 969 + }, + { + "epoch": 1.7290552584670231, + "grad_norm": 1.3974928855895996, + "learning_rate": 8.374628344895937e-07, + "loss": 0.8352, + "step": 970 + }, + { + "epoch": 1.7290552584670231, + "eval_loss": 0.9675564169883728, + "eval_runtime": 46.6655, + "eval_samples_per_second": 21.429, + "eval_steps_per_second": 1.35, + "step": 970 + }, + { + "epoch": 1.7308377896613192, + "grad_norm": 1.4703048467636108, + "learning_rate": 8.325074331020813e-07, + "loss": 0.8202, + "step": 971 + }, + { + "epoch": 1.732620320855615, + "grad_norm": 1.364524245262146, + "learning_rate": 8.275520317145689e-07, + "loss": 0.8476, + "step": 972 + }, + { + "epoch": 1.7344028520499108, + "grad_norm": 1.377961277961731, + "learning_rate": 8.225966303270566e-07, + "loss": 0.8582, + "step": 973 + }, + { + "epoch": 1.736185383244207, + "grad_norm": 1.3737229108810425, + "learning_rate": 8.176412289395442e-07, + "loss": 
0.8649, + "step": 974 + }, + { + "epoch": 1.7379679144385025, + "grad_norm": 1.5605295896530151, + "learning_rate": 8.126858275520318e-07, + "loss": 0.8465, + "step": 975 + }, + { + "epoch": 1.7397504456327986, + "grad_norm": 1.3562301397323608, + "learning_rate": 8.077304261645194e-07, + "loss": 0.8662, + "step": 976 + }, + { + "epoch": 1.7415329768270946, + "grad_norm": 1.3104592561721802, + "learning_rate": 8.02775024777007e-07, + "loss": 0.8463, + "step": 977 + }, + { + "epoch": 1.7433155080213902, + "grad_norm": 1.5199209451675415, + "learning_rate": 7.978196233894946e-07, + "loss": 0.8942, + "step": 978 + }, + { + "epoch": 1.7450980392156863, + "grad_norm": 1.464303970336914, + "learning_rate": 7.928642220019823e-07, + "loss": 0.8493, + "step": 979 + }, + { + "epoch": 1.7468805704099821, + "grad_norm": 1.5610185861587524, + "learning_rate": 7.879088206144698e-07, + "loss": 0.8577, + "step": 980 + }, + { + "epoch": 1.7468805704099821, + "eval_loss": 0.9670752882957458, + "eval_runtime": 46.6969, + "eval_samples_per_second": 21.415, + "eval_steps_per_second": 1.349, + "step": 980 + }, + { + "epoch": 1.748663101604278, + "grad_norm": 1.378450870513916, + "learning_rate": 7.829534192269575e-07, + "loss": 0.8659, + "step": 981 + }, + { + "epoch": 1.750445632798574, + "grad_norm": 1.45065176486969, + "learning_rate": 7.77998017839445e-07, + "loss": 0.8607, + "step": 982 + }, + { + "epoch": 1.7522281639928698, + "grad_norm": 1.4157710075378418, + "learning_rate": 7.730426164519327e-07, + "loss": 0.8625, + "step": 983 + }, + { + "epoch": 1.7540106951871657, + "grad_norm": 1.481164813041687, + "learning_rate": 7.680872150644203e-07, + "loss": 0.864, + "step": 984 + }, + { + "epoch": 1.7557932263814617, + "grad_norm": 1.3908096551895142, + "learning_rate": 7.631318136769079e-07, + "loss": 0.8572, + "step": 985 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 1.392127275466919, + "learning_rate": 7.581764122893955e-07, + "loss": 0.8524, + "step": 986 + }, + { + "epoch": 1.7593582887700534, + "grad_norm": 1.3347569704055786, + "learning_rate": 7.532210109018831e-07, + "loss": 0.8442, + "step": 987 + }, + { + "epoch": 1.7611408199643495, + "grad_norm": 1.4272398948669434, + "learning_rate": 7.482656095143708e-07, + "loss": 0.8515, + "step": 988 + }, + { + "epoch": 1.7629233511586453, + "grad_norm": 1.5819486379623413, + "learning_rate": 7.433102081268583e-07, + "loss": 0.8543, + "step": 989 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.479175090789795, + "learning_rate": 7.383548067393459e-07, + "loss": 0.8975, + "step": 990 + }, + { + "epoch": 1.7647058823529411, + "eval_loss": 0.967066764831543, + "eval_runtime": 46.4369, + "eval_samples_per_second": 21.535, + "eval_steps_per_second": 1.357, + "step": 990 + }, + { + "epoch": 1.7664884135472372, + "grad_norm": 1.5135704278945923, + "learning_rate": 7.333994053518336e-07, + "loss": 0.8402, + "step": 991 + }, + { + "epoch": 1.768270944741533, + "grad_norm": 1.5999902486801147, + "learning_rate": 7.284440039643212e-07, + "loss": 0.8429, + "step": 992 + }, + { + "epoch": 1.7700534759358288, + "grad_norm": 1.5118600130081177, + "learning_rate": 7.234886025768089e-07, + "loss": 0.8635, + "step": 993 + }, + { + "epoch": 1.771836007130125, + "grad_norm": 1.4912101030349731, + "learning_rate": 7.185332011892963e-07, + "loss": 0.8448, + "step": 994 + }, + { + "epoch": 1.7736185383244205, + "grad_norm": 1.3264952898025513, + "learning_rate": 7.13577799801784e-07, + "loss": 0.839, + "step": 995 + }, + { + "epoch": 1.7754010695187166, + 
"grad_norm": 1.4948172569274902, + "learning_rate": 7.086223984142717e-07, + "loss": 0.856, + "step": 996 + }, + { + "epoch": 1.7771836007130126, + "grad_norm": 1.3413145542144775, + "learning_rate": 7.036669970267593e-07, + "loss": 0.8328, + "step": 997 + }, + { + "epoch": 1.7789661319073082, + "grad_norm": 1.6730403900146484, + "learning_rate": 6.987115956392468e-07, + "loss": 0.8902, + "step": 998 + }, + { + "epoch": 1.7807486631016043, + "grad_norm": 1.2651108503341675, + "learning_rate": 6.937561942517344e-07, + "loss": 0.8523, + "step": 999 + }, + { + "epoch": 1.7825311942959001, + "grad_norm": 1.4899260997772217, + "learning_rate": 6.888007928642221e-07, + "loss": 0.8535, + "step": 1000 + }, + { + "epoch": 1.7825311942959001, + "eval_loss": 0.9660161137580872, + "eval_runtime": 46.3938, + "eval_samples_per_second": 21.555, + "eval_steps_per_second": 1.358, + "step": 1000 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 1.3794513940811157, + "learning_rate": 6.838453914767098e-07, + "loss": 0.8702, + "step": 1001 + }, + { + "epoch": 1.786096256684492, + "grad_norm": 1.3967653512954712, + "learning_rate": 6.788899900891972e-07, + "loss": 0.861, + "step": 1002 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 1.330071210861206, + "learning_rate": 6.739345887016849e-07, + "loss": 0.851, + "step": 1003 + }, + { + "epoch": 1.7896613190730837, + "grad_norm": 1.358818769454956, + "learning_rate": 6.689791873141725e-07, + "loss": 0.8549, + "step": 1004 + }, + { + "epoch": 1.7914438502673797, + "grad_norm": 1.3878623247146606, + "learning_rate": 6.640237859266602e-07, + "loss": 0.8318, + "step": 1005 + }, + { + "epoch": 1.7932263814616756, + "grad_norm": 1.4242222309112549, + "learning_rate": 6.590683845391476e-07, + "loss": 0.8061, + "step": 1006 + }, + { + "epoch": 1.7950089126559714, + "grad_norm": 1.4395697116851807, + "learning_rate": 6.541129831516353e-07, + "loss": 0.8766, + "step": 1007 + }, + { + "epoch": 1.7967914438502675, + "grad_norm": 1.402297854423523, + "learning_rate": 6.49157581764123e-07, + "loss": 0.8798, + "step": 1008 + }, + { + "epoch": 1.7985739750445633, + "grad_norm": 1.275490641593933, + "learning_rate": 6.442021803766106e-07, + "loss": 0.8721, + "step": 1009 + }, + { + "epoch": 1.8003565062388591, + "grad_norm": 1.4236700534820557, + "learning_rate": 6.392467789890981e-07, + "loss": 0.8363, + "step": 1010 + }, + { + "epoch": 1.8003565062388591, + "eval_loss": 0.9658026695251465, + "eval_runtime": 46.4079, + "eval_samples_per_second": 21.548, + "eval_steps_per_second": 1.358, + "step": 1010 + }, + { + "epoch": 1.8021390374331552, + "grad_norm": 1.3970041275024414, + "learning_rate": 6.342913776015857e-07, + "loss": 0.8874, + "step": 1011 + }, + { + "epoch": 1.803921568627451, + "grad_norm": 1.4485187530517578, + "learning_rate": 6.293359762140734e-07, + "loss": 0.8852, + "step": 1012 + }, + { + "epoch": 1.8057040998217468, + "grad_norm": 1.4213767051696777, + "learning_rate": 6.24380574826561e-07, + "loss": 0.8839, + "step": 1013 + }, + { + "epoch": 1.807486631016043, + "grad_norm": 1.3294216394424438, + "learning_rate": 6.194251734390486e-07, + "loss": 0.8104, + "step": 1014 + }, + { + "epoch": 1.8092691622103387, + "grad_norm": 1.532103180885315, + "learning_rate": 6.144697720515362e-07, + "loss": 0.8569, + "step": 1015 + }, + { + "epoch": 1.8110516934046346, + "grad_norm": 1.565508484840393, + "learning_rate": 6.095143706640238e-07, + "loss": 0.8487, + "step": 1016 + }, + { + "epoch": 1.8128342245989306, + "grad_norm": 1.3395127058029175, + 
"learning_rate": 6.045589692765114e-07, + "loss": 0.8787, + "step": 1017 + }, + { + "epoch": 1.8146167557932262, + "grad_norm": 1.433997631072998, + "learning_rate": 5.996035678889991e-07, + "loss": 0.8642, + "step": 1018 + }, + { + "epoch": 1.8163992869875223, + "grad_norm": 1.4260884523391724, + "learning_rate": 5.946481665014866e-07, + "loss": 0.8388, + "step": 1019 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.3741999864578247, + "learning_rate": 5.896927651139743e-07, + "loss": 0.8299, + "step": 1020 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.9656883478164673, + "eval_runtime": 46.3715, + "eval_samples_per_second": 21.565, + "eval_steps_per_second": 1.359, + "step": 1020 + }, + { + "epoch": 1.819964349376114, + "grad_norm": 1.28693687915802, + "learning_rate": 5.847373637264618e-07, + "loss": 0.8359, + "step": 1021 + }, + { + "epoch": 1.82174688057041, + "grad_norm": 1.3816742897033691, + "learning_rate": 5.797819623389495e-07, + "loss": 0.8802, + "step": 1022 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 1.3577059507369995, + "learning_rate": 5.748265609514372e-07, + "loss": 0.8391, + "step": 1023 + }, + { + "epoch": 1.8253119429590017, + "grad_norm": 1.3769915103912354, + "learning_rate": 5.698711595639247e-07, + "loss": 0.8568, + "step": 1024 + }, + { + "epoch": 1.8270944741532977, + "grad_norm": 1.3751606941223145, + "learning_rate": 5.649157581764124e-07, + "loss": 0.865, + "step": 1025 + }, + { + "epoch": 1.8288770053475936, + "grad_norm": 1.3568124771118164, + "learning_rate": 5.599603567888999e-07, + "loss": 0.8728, + "step": 1026 + }, + { + "epoch": 1.8306595365418894, + "grad_norm": 1.5252009630203247, + "learning_rate": 5.550049554013876e-07, + "loss": 0.8675, + "step": 1027 + }, + { + "epoch": 1.8324420677361855, + "grad_norm": 1.46947181224823, + "learning_rate": 5.500495540138751e-07, + "loss": 0.8545, + "step": 1028 + }, + { + "epoch": 1.8342245989304813, + "grad_norm": 1.430268406867981, + "learning_rate": 5.450941526263628e-07, + "loss": 0.8329, + "step": 1029 + }, + { + "epoch": 1.8360071301247771, + "grad_norm": 1.3437092304229736, + "learning_rate": 5.401387512388505e-07, + "loss": 0.8291, + "step": 1030 + }, + { + "epoch": 1.8360071301247771, + "eval_loss": 0.9675397872924805, + "eval_runtime": 46.4016, + "eval_samples_per_second": 21.551, + "eval_steps_per_second": 1.358, + "step": 1030 + }, + { + "epoch": 1.8377896613190732, + "grad_norm": 1.40646231174469, + "learning_rate": 5.35183349851338e-07, + "loss": 0.8745, + "step": 1031 + }, + { + "epoch": 1.839572192513369, + "grad_norm": 1.4662535190582275, + "learning_rate": 5.302279484638257e-07, + "loss": 0.8494, + "step": 1032 + }, + { + "epoch": 1.8413547237076648, + "grad_norm": 1.535057783126831, + "learning_rate": 5.252725470763132e-07, + "loss": 0.8445, + "step": 1033 + }, + { + "epoch": 1.843137254901961, + "grad_norm": 1.4149518013000488, + "learning_rate": 5.203171456888009e-07, + "loss": 0.8753, + "step": 1034 + }, + { + "epoch": 1.8449197860962567, + "grad_norm": 1.4421366453170776, + "learning_rate": 5.153617443012885e-07, + "loss": 0.8435, + "step": 1035 + }, + { + "epoch": 1.8467023172905526, + "grad_norm": 1.384452223777771, + "learning_rate": 5.104063429137761e-07, + "loss": 0.8768, + "step": 1036 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 1.4903335571289062, + "learning_rate": 5.054509415262637e-07, + "loss": 0.8501, + "step": 1037 + }, + { + "epoch": 1.8502673796791442, + "grad_norm": 1.3531508445739746, + "learning_rate": 5.004955401387513e-07, + 
"loss": 0.8837, + "step": 1038 + }, + { + "epoch": 1.8520499108734403, + "grad_norm": 1.486277461051941, + "learning_rate": 4.955401387512389e-07, + "loss": 0.853, + "step": 1039 + }, + { + "epoch": 1.8538324420677363, + "grad_norm": 1.4410507678985596, + "learning_rate": 4.905847373637266e-07, + "loss": 0.8464, + "step": 1040 + }, + { + "epoch": 1.8538324420677363, + "eval_loss": 0.9651731252670288, + "eval_runtime": 46.4455, + "eval_samples_per_second": 21.531, + "eval_steps_per_second": 1.356, + "step": 1040 + }, + { + "epoch": 1.855614973262032, + "grad_norm": 1.268168568611145, + "learning_rate": 4.856293359762141e-07, + "loss": 0.8265, + "step": 1041 + }, + { + "epoch": 1.857397504456328, + "grad_norm": 1.409671664237976, + "learning_rate": 4.806739345887018e-07, + "loss": 0.8374, + "step": 1042 + }, + { + "epoch": 1.8591800356506238, + "grad_norm": 1.6630103588104248, + "learning_rate": 4.7571853320118933e-07, + "loss": 0.8632, + "step": 1043 + }, + { + "epoch": 1.8609625668449197, + "grad_norm": 1.4358583688735962, + "learning_rate": 4.70763131813677e-07, + "loss": 0.8782, + "step": 1044 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 1.3741272687911987, + "learning_rate": 4.6580773042616455e-07, + "loss": 0.8455, + "step": 1045 + }, + { + "epoch": 1.8645276292335116, + "grad_norm": 1.2778054475784302, + "learning_rate": 4.608523290386522e-07, + "loss": 0.8218, + "step": 1046 + }, + { + "epoch": 1.8663101604278074, + "grad_norm": 1.3522720336914062, + "learning_rate": 4.5589692765113977e-07, + "loss": 0.8633, + "step": 1047 + }, + { + "epoch": 1.8680926916221035, + "grad_norm": 1.49508798122406, + "learning_rate": 4.5094152626362743e-07, + "loss": 0.8746, + "step": 1048 + }, + { + "epoch": 1.8698752228163993, + "grad_norm": 1.3672457933425903, + "learning_rate": 4.45986124876115e-07, + "loss": 0.8559, + "step": 1049 + }, + { + "epoch": 1.8716577540106951, + "grad_norm": 1.4118934869766235, + "learning_rate": 4.4103072348860265e-07, + "loss": 0.8701, + "step": 1050 + }, + { + "epoch": 1.8716577540106951, + "eval_loss": 0.9658851027488708, + "eval_runtime": 46.4763, + "eval_samples_per_second": 21.516, + "eval_steps_per_second": 1.356, + "step": 1050 + }, + { + "epoch": 1.8734402852049912, + "grad_norm": 1.3773859739303589, + "learning_rate": 4.360753221010902e-07, + "loss": 0.871, + "step": 1051 + }, + { + "epoch": 1.875222816399287, + "grad_norm": 1.3667681217193604, + "learning_rate": 4.3111992071357786e-07, + "loss": 0.8678, + "step": 1052 + }, + { + "epoch": 1.8770053475935828, + "grad_norm": 1.4455238580703735, + "learning_rate": 4.261645193260654e-07, + "loss": 0.8475, + "step": 1053 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 1.5423219203948975, + "learning_rate": 4.212091179385531e-07, + "loss": 0.895, + "step": 1054 + }, + { + "epoch": 1.8805704099821747, + "grad_norm": 1.3430986404418945, + "learning_rate": 4.1625371655104064e-07, + "loss": 0.8616, + "step": 1055 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.4237287044525146, + "learning_rate": 4.112983151635283e-07, + "loss": 0.8642, + "step": 1056 + }, + { + "epoch": 1.8841354723707666, + "grad_norm": 1.3155893087387085, + "learning_rate": 4.063429137760159e-07, + "loss": 0.8269, + "step": 1057 + }, + { + "epoch": 1.8859180035650622, + "grad_norm": 1.390587329864502, + "learning_rate": 4.013875123885035e-07, + "loss": 0.8609, + "step": 1058 + }, + { + "epoch": 1.8877005347593583, + "grad_norm": 1.3156342506408691, + "learning_rate": 3.964321110009911e-07, + "loss": 0.8487, + "step": 1059 + }, 
+ { + "epoch": 1.8894830659536543, + "grad_norm": 1.3792142868041992, + "learning_rate": 3.9147670961347873e-07, + "loss": 0.8581, + "step": 1060 + }, + { + "epoch": 1.8894830659536543, + "eval_loss": 0.9655643701553345, + "eval_runtime": 46.4516, + "eval_samples_per_second": 21.528, + "eval_steps_per_second": 1.356, + "step": 1060 + }, + { + "epoch": 1.89126559714795, + "grad_norm": 1.407735824584961, + "learning_rate": 3.8652130822596634e-07, + "loss": 0.8542, + "step": 1061 + }, + { + "epoch": 1.893048128342246, + "grad_norm": 1.2963098287582397, + "learning_rate": 3.8156590683845395e-07, + "loss": 0.8487, + "step": 1062 + }, + { + "epoch": 1.8948306595365418, + "grad_norm": 1.3685920238494873, + "learning_rate": 3.7661050545094156e-07, + "loss": 0.8637, + "step": 1063 + }, + { + "epoch": 1.8966131907308377, + "grad_norm": 1.3968452215194702, + "learning_rate": 3.7165510406342917e-07, + "loss": 0.8487, + "step": 1064 + }, + { + "epoch": 1.8983957219251337, + "grad_norm": 1.358083724975586, + "learning_rate": 3.666997026759168e-07, + "loss": 0.8383, + "step": 1065 + }, + { + "epoch": 1.9001782531194296, + "grad_norm": 1.3680518865585327, + "learning_rate": 3.6174430128840444e-07, + "loss": 0.8727, + "step": 1066 + }, + { + "epoch": 1.9019607843137254, + "grad_norm": 1.481414794921875, + "learning_rate": 3.56788899900892e-07, + "loss": 0.8667, + "step": 1067 + }, + { + "epoch": 1.9037433155080214, + "grad_norm": 1.2826118469238281, + "learning_rate": 3.5183349851337966e-07, + "loss": 0.8814, + "step": 1068 + }, + { + "epoch": 1.9055258467023173, + "grad_norm": 1.2083613872528076, + "learning_rate": 3.468780971258672e-07, + "loss": 0.8351, + "step": 1069 + }, + { + "epoch": 1.9073083778966131, + "grad_norm": 1.3396915197372437, + "learning_rate": 3.419226957383549e-07, + "loss": 0.8462, + "step": 1070 + }, + { + "epoch": 1.9073083778966131, + "eval_loss": 0.9655903577804565, + "eval_runtime": 46.3914, + "eval_samples_per_second": 21.556, + "eval_steps_per_second": 1.358, + "step": 1070 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.2929346561431885, + "learning_rate": 3.3696729435084243e-07, + "loss": 0.8624, + "step": 1071 + }, + { + "epoch": 1.910873440285205, + "grad_norm": 1.4455654621124268, + "learning_rate": 3.320118929633301e-07, + "loss": 0.8553, + "step": 1072 + }, + { + "epoch": 1.9126559714795008, + "grad_norm": 1.4255995750427246, + "learning_rate": 3.2705649157581765e-07, + "loss": 0.8071, + "step": 1073 + }, + { + "epoch": 1.914438502673797, + "grad_norm": 1.3290305137634277, + "learning_rate": 3.221010901883053e-07, + "loss": 0.8566, + "step": 1074 + }, + { + "epoch": 1.9162210338680927, + "grad_norm": 1.3911030292510986, + "learning_rate": 3.1714568880079287e-07, + "loss": 0.8747, + "step": 1075 + }, + { + "epoch": 1.9180035650623886, + "grad_norm": 1.3865312337875366, + "learning_rate": 3.121902874132805e-07, + "loss": 0.854, + "step": 1076 + }, + { + "epoch": 1.9197860962566846, + "grad_norm": 1.332739233970642, + "learning_rate": 3.072348860257681e-07, + "loss": 0.8461, + "step": 1077 + }, + { + "epoch": 1.9215686274509802, + "grad_norm": 1.3937904834747314, + "learning_rate": 3.022794846382557e-07, + "loss": 0.8532, + "step": 1078 + }, + { + "epoch": 1.9233511586452763, + "grad_norm": 1.299120545387268, + "learning_rate": 2.973240832507433e-07, + "loss": 0.8327, + "step": 1079 + }, + { + "epoch": 1.9251336898395723, + "grad_norm": 1.3246283531188965, + "learning_rate": 2.923686818632309e-07, + "loss": 0.8591, + "step": 1080 + }, + { + "epoch": 
1.9251336898395723, + "eval_loss": 0.9655869603157043, + "eval_runtime": 46.5255, + "eval_samples_per_second": 21.494, + "eval_steps_per_second": 1.354, + "step": 1080 + }, + { + "epoch": 1.926916221033868, + "grad_norm": 1.4631216526031494, + "learning_rate": 2.874132804757186e-07, + "loss": 0.8395, + "step": 1081 + }, + { + "epoch": 1.928698752228164, + "grad_norm": 1.3864542245864868, + "learning_rate": 2.824578790882062e-07, + "loss": 0.8658, + "step": 1082 + }, + { + "epoch": 1.93048128342246, + "grad_norm": 1.5314630270004272, + "learning_rate": 2.775024777006938e-07, + "loss": 0.8729, + "step": 1083 + }, + { + "epoch": 1.9322638146167557, + "grad_norm": 1.379754662513733, + "learning_rate": 2.725470763131814e-07, + "loss": 0.8533, + "step": 1084 + }, + { + "epoch": 1.9340463458110517, + "grad_norm": 1.5379613637924194, + "learning_rate": 2.67591674925669e-07, + "loss": 0.8733, + "step": 1085 + }, + { + "epoch": 1.9358288770053476, + "grad_norm": 1.3273158073425293, + "learning_rate": 2.626362735381566e-07, + "loss": 0.8661, + "step": 1086 + }, + { + "epoch": 1.9376114081996434, + "grad_norm": 1.4625751972198486, + "learning_rate": 2.5768087215064423e-07, + "loss": 0.8725, + "step": 1087 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 1.4103567600250244, + "learning_rate": 2.5272547076313184e-07, + "loss": 0.8406, + "step": 1088 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.4515103101730347, + "learning_rate": 2.4777006937561945e-07, + "loss": 0.8759, + "step": 1089 + }, + { + "epoch": 1.9429590017825311, + "grad_norm": 1.4476619958877563, + "learning_rate": 2.4281466798810706e-07, + "loss": 0.8511, + "step": 1090 + }, + { + "epoch": 1.9429590017825311, + "eval_loss": 0.9651136994361877, + "eval_runtime": 46.6353, + "eval_samples_per_second": 21.443, + "eval_steps_per_second": 1.351, + "step": 1090 + }, + { + "epoch": 1.9447415329768272, + "grad_norm": 1.3023127317428589, + "learning_rate": 2.3785926660059467e-07, + "loss": 0.8293, + "step": 1091 + }, + { + "epoch": 1.946524064171123, + "grad_norm": 1.3094813823699951, + "learning_rate": 2.3290386521308227e-07, + "loss": 0.8309, + "step": 1092 + }, + { + "epoch": 1.9483065953654188, + "grad_norm": 1.3473302125930786, + "learning_rate": 2.2794846382556988e-07, + "loss": 0.8577, + "step": 1093 + }, + { + "epoch": 1.950089126559715, + "grad_norm": 1.3242158889770508, + "learning_rate": 2.229930624380575e-07, + "loss": 0.8603, + "step": 1094 + }, + { + "epoch": 1.9518716577540107, + "grad_norm": 1.3417590856552124, + "learning_rate": 2.180376610505451e-07, + "loss": 0.8566, + "step": 1095 + }, + { + "epoch": 1.9536541889483066, + "grad_norm": 1.3229726552963257, + "learning_rate": 2.130822596630327e-07, + "loss": 0.8592, + "step": 1096 + }, + { + "epoch": 1.9554367201426026, + "grad_norm": 1.2532265186309814, + "learning_rate": 2.0812685827552032e-07, + "loss": 0.8184, + "step": 1097 + }, + { + "epoch": 1.9572192513368984, + "grad_norm": 1.352526307106018, + "learning_rate": 2.0317145688800795e-07, + "loss": 0.8401, + "step": 1098 + }, + { + "epoch": 1.9590017825311943, + "grad_norm": 1.3978582620620728, + "learning_rate": 1.9821605550049556e-07, + "loss": 0.8697, + "step": 1099 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 1.4667596817016602, + "learning_rate": 1.9326065411298317e-07, + "loss": 0.8625, + "step": 1100 + }, + { + "epoch": 1.9607843137254903, + "eval_loss": 0.9646411538124084, + "eval_runtime": 46.485, + "eval_samples_per_second": 21.512, + "eval_steps_per_second": 1.355, + "step": 1100 + }, + { 
+ "epoch": 1.962566844919786, + "grad_norm": 1.5264577865600586, + "learning_rate": 1.8830525272547078e-07, + "loss": 0.8593, + "step": 1101 + }, + { + "epoch": 1.964349376114082, + "grad_norm": 1.3619505167007446, + "learning_rate": 1.833498513379584e-07, + "loss": 0.8339, + "step": 1102 + }, + { + "epoch": 1.966131907308378, + "grad_norm": 1.4145138263702393, + "learning_rate": 1.78394449950446e-07, + "loss": 0.8666, + "step": 1103 + }, + { + "epoch": 1.9679144385026737, + "grad_norm": 1.5150694847106934, + "learning_rate": 1.734390485629336e-07, + "loss": 0.8658, + "step": 1104 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 1.400248646736145, + "learning_rate": 1.6848364717542122e-07, + "loss": 0.8715, + "step": 1105 + }, + { + "epoch": 1.9714795008912656, + "grad_norm": 1.298861026763916, + "learning_rate": 1.6352824578790883e-07, + "loss": 0.8883, + "step": 1106 + }, + { + "epoch": 1.9732620320855614, + "grad_norm": 1.3148902654647827, + "learning_rate": 1.5857284440039643e-07, + "loss": 0.8369, + "step": 1107 + }, + { + "epoch": 1.9750445632798574, + "grad_norm": 1.4141972064971924, + "learning_rate": 1.5361744301288404e-07, + "loss": 0.8589, + "step": 1108 + }, + { + "epoch": 1.9768270944741533, + "grad_norm": 1.3640660047531128, + "learning_rate": 1.4866204162537165e-07, + "loss": 0.8505, + "step": 1109 + }, + { + "epoch": 1.9786096256684491, + "grad_norm": 1.3286265134811401, + "learning_rate": 1.437066402378593e-07, + "loss": 0.8301, + "step": 1110 + }, + { + "epoch": 1.9786096256684491, + "eval_loss": 0.9635146260261536, + "eval_runtime": 46.687, + "eval_samples_per_second": 21.419, + "eval_steps_per_second": 1.349, + "step": 1110 + }, + { + "epoch": 1.9803921568627452, + "grad_norm": 1.3741545677185059, + "learning_rate": 1.387512388503469e-07, + "loss": 0.8125, + "step": 1111 + }, + { + "epoch": 1.982174688057041, + "grad_norm": 1.324223518371582, + "learning_rate": 1.337958374628345e-07, + "loss": 0.8809, + "step": 1112 + }, + { + "epoch": 1.9839572192513368, + "grad_norm": 1.3497804403305054, + "learning_rate": 1.2884043607532211e-07, + "loss": 0.8602, + "step": 1113 + }, + { + "epoch": 1.985739750445633, + "grad_norm": 1.4052799940109253, + "learning_rate": 1.2388503468780972e-07, + "loss": 0.8791, + "step": 1114 + }, + { + "epoch": 1.9875222816399287, + "grad_norm": 1.344778060913086, + "learning_rate": 1.1892963330029733e-07, + "loss": 0.8234, + "step": 1115 + }, + { + "epoch": 1.9893048128342246, + "grad_norm": 1.3553035259246826, + "learning_rate": 1.1397423191278494e-07, + "loss": 0.8653, + "step": 1116 + }, + { + "epoch": 1.9910873440285206, + "grad_norm": 1.3913415670394897, + "learning_rate": 1.0901883052527255e-07, + "loss": 0.8166, + "step": 1117 + }, + { + "epoch": 1.9928698752228164, + "grad_norm": 1.4442999362945557, + "learning_rate": 1.0406342913776016e-07, + "loss": 0.8955, + "step": 1118 + }, + { + "epoch": 1.9946524064171123, + "grad_norm": 1.413307547569275, + "learning_rate": 9.910802775024778e-08, + "loss": 0.8523, + "step": 1119 + }, + { + "epoch": 1.9964349376114083, + "grad_norm": 1.5048857927322388, + "learning_rate": 9.415262636273539e-08, + "loss": 0.837, + "step": 1120 + }, + { + "epoch": 1.9964349376114083, + "eval_loss": 0.9632958173751831, + "eval_runtime": 46.6855, + "eval_samples_per_second": 21.42, + "eval_steps_per_second": 1.349, + "step": 1120 + }, + { + "epoch": 1.998217468805704, + "grad_norm": 1.2615009546279907, + "learning_rate": 8.9197224975223e-08, + "loss": 0.8477, + "step": 1121 + }, + { + "epoch": 2.0, + "grad_norm": 
1.3137235641479492, + "learning_rate": 8.424182358771061e-08, + "loss": 0.8359, + "step": 1122 + } + ], + "logging_steps": 1, + "max_steps": 1122, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1698030233507594e+18, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}
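
Note (not part of the checkpoint file itself): the `log_history` array above interleaves training entries (keyed by "loss") with evaluation entries (keyed by "eval_loss") every `eval_steps`. Below is a minimal, illustrative Python sketch for separating and plotting the two curves from a file like this one; the relative path and the use of matplotlib are assumptions for illustration, not anything prescribed by the checkpoint.

# Minimal sketch: read a trainer_state.json and plot train vs. eval loss.
# Assumes the file is available locally as "trainer_state.json".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train), label="train loss")
plt.plot(*zip(*evals), label="eval loss", marker="o")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()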