diff --git "a/checkpoints/Qwen2.5-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1382/trainer_state.json" "b/checkpoints/Qwen2.5-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1382/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/Qwen2.5-3B/babylm_reverse_partial_10M_seed0/runs/checkpoint-1382/trainer_state.json" @@ -0,0 +1,10811 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 10, + "global_step": 1382, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001447178002894356, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.5884, + "step": 1 + }, + { + "epoch": 0.002894356005788712, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6206, + "step": 2 + }, + { + "epoch": 0.004341534008683068, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6233, + "step": 3 + }, + { + "epoch": 0.005788712011577424, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6143, + "step": 4 + }, + { + "epoch": 0.00723589001447178, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6268, + "step": 5 + }, + { + "epoch": 0.008683068017366137, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6432, + "step": 6 + }, + { + "epoch": 0.010130246020260492, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6307, + "step": 7 + }, + { + "epoch": 0.011577424023154847, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6207, + "step": 8 + }, + { + "epoch": 0.013024602026049204, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6309, + "step": 9 + }, + { + "epoch": 0.01447178002894356, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6136, + "step": 10 + }, + { + "epoch": 0.01447178002894356, + "eval_loss": 1.6420561075210571, + "eval_runtime": 22.7096, + "eval_samples_per_second": 44.034, + "eval_steps_per_second": 2.774, + "step": 10 + }, + { + "epoch": 0.015918958031837915, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6511, + "step": 11 + }, + { + "epoch": 0.017366136034732273, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6452, + "step": 12 + }, + { + "epoch": 0.01881331403762663, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6388, + "step": 13 + }, + { + "epoch": 0.020260492040520984, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 1.6148, + "step": 14 + }, + { + "epoch": 0.02170767004341534, + "grad_norm": 2.1367368698120117, + "learning_rate": 3.597122302158274e-08, + "loss": 1.6534, + "step": 15 + }, + { + "epoch": 0.023154848046309694, + "grad_norm": 2.132082223892212, + "learning_rate": 7.194244604316547e-08, + "loss": 1.6259, + "step": 16 + }, + { + "epoch": 0.024602026049204053, + "grad_norm": 2.1603453159332275, + "learning_rate": 1.0791366906474822e-07, + "loss": 1.624, + "step": 17 + }, + { + "epoch": 0.02604920405209841, + "grad_norm": 2.125673770904541, + "learning_rate": 1.4388489208633095e-07, + "loss": 1.6566, + "step": 18 + }, + { + "epoch": 0.027496382054992764, + "grad_norm": 2.3304903507232666, + "learning_rate": 1.7985611510791368e-07, + "loss": 1.6216, + "step": 19 + }, + { + "epoch": 0.02894356005788712, + "grad_norm": 2.1149847507476807, + "learning_rate": 2.1582733812949643e-07, + "loss": 1.6413, + "step": 20 + }, + { + "epoch": 0.02894356005788712, + "eval_loss": 1.6411408185958862, + "eval_runtime": 22.7542, + "eval_samples_per_second": 43.948, + "eval_steps_per_second": 2.769, + "step": 20 + }, + { + "epoch": 0.030390738060781478, + "grad_norm": 2.266050338745117, + "learning_rate": 2.5179856115107916e-07, + "loss": 1.6375, + "step": 21 + }, + { + "epoch": 0.03183791606367583, + "grad_norm": 2.2931127548217773, + "learning_rate": 2.877697841726619e-07, + "loss": 1.6043, + "step": 22 + }, + { + "epoch": 0.03328509406657019, + "grad_norm": 2.0147247314453125, + "learning_rate": 3.237410071942446e-07, + "loss": 1.611, + "step": 23 + }, + { + "epoch": 0.03473227206946455, + "grad_norm": 2.0441410541534424, + "learning_rate": 3.5971223021582736e-07, + "loss": 1.6524, + "step": 24 + }, + { + "epoch": 0.0361794500723589, + "grad_norm": 1.937894582748413, + "learning_rate": 3.956834532374101e-07, + "loss": 1.6355, + "step": 25 + }, + { + "epoch": 0.03762662807525326, + "grad_norm": 1.7540408372879028, + "learning_rate": 4.3165467625899287e-07, + "loss": 1.5882, + "step": 26 + }, + { + "epoch": 0.03907380607814761, + "grad_norm": 1.5020838975906372, + "learning_rate": 4.676258992805756e-07, + "loss": 1.6115, + "step": 27 + }, + { + "epoch": 0.04052098408104197, + "grad_norm": 1.707319974899292, + "learning_rate": 5.035971223021583e-07, + "loss": 1.6458, + "step": 28 + }, + { + "epoch": 0.041968162083936326, + "grad_norm": 1.410895824432373, + "learning_rate": 5.39568345323741e-07, + "loss": 1.5857, + "step": 29 + }, + { + "epoch": 0.04341534008683068, + "grad_norm": 1.582987904548645, + "learning_rate": 5.755395683453238e-07, + "loss": 1.6063, + "step": 30 + }, + { + "epoch": 0.04341534008683068, + "eval_loss": 1.6158286333084106, + "eval_runtime": 23.25, + "eval_samples_per_second": 43.011, + "eval_steps_per_second": 2.71, + "step": 30 + }, + { + "epoch": 0.04486251808972504, + "grad_norm": 1.1944270133972168, + "learning_rate": 6.115107913669066e-07, + "loss": 1.5846, + "step": 31 + }, + { + "epoch": 0.04630969609261939, + "grad_norm": 1.180711269378662, + "learning_rate": 6.474820143884893e-07, + "loss": 1.6103, + "step": 32 + }, + { + "epoch": 0.04775687409551375, + "grad_norm": 1.2086774110794067, + "learning_rate": 6.83453237410072e-07, + "loss": 1.6071, + "step": 33 + }, + { + "epoch": 0.049204052098408106, + "grad_norm": 1.0953423976898193, + "learning_rate": 7.194244604316547e-07, + "loss": 1.5981, + "step": 34 + }, + { + "epoch": 0.05065123010130246, + "grad_norm": 1.2060003280639648, + "learning_rate": 7.553956834532375e-07, + "loss": 1.578, + "step": 35 + }, + { + "epoch": 0.05209840810419682, + "grad_norm": 1.087763786315918, + "learning_rate": 7.913669064748202e-07, + "loss": 1.57, + "step": 36 + }, + { + "epoch": 0.053545586107091175, + "grad_norm": 1.1784454584121704, + "learning_rate": 8.27338129496403e-07, + "loss": 1.5977, + "step": 37 + }, + { + "epoch": 0.05499276410998553, + "grad_norm": 1.1461832523345947, + "learning_rate": 8.633093525179857e-07, + "loss": 1.5604, + "step": 38 + }, + { + "epoch": 0.056439942112879886, + "grad_norm": 0.856616199016571, + "learning_rate": 8.992805755395684e-07, + "loss": 1.5734, + "step": 39 + }, + { + "epoch": 0.05788712011577424, + "grad_norm": 0.8984797596931458, + "learning_rate": 9.352517985611512e-07, + "loss": 1.5646, + "step": 40 + }, + { + "epoch": 0.05788712011577424, + "eval_loss": 1.584234356880188, + "eval_runtime": 23.6589, + "eval_samples_per_second": 42.267, + "eval_steps_per_second": 2.663, + "step": 40 + }, + { + "epoch": 0.059334298118668596, + "grad_norm": 0.9018377065658569, + "learning_rate": 9.71223021582734e-07, + "loss": 1.5585, + "step": 41 + }, + { + "epoch": 0.060781476121562955, + "grad_norm": 0.9337756633758545, + "learning_rate": 1.0071942446043167e-06, + "loss": 1.577, + "step": 42 + }, + { + "epoch": 0.06222865412445731, + "grad_norm": 0.8454087972640991, + "learning_rate": 1.0431654676258993e-06, + "loss": 1.5961, + "step": 43 + }, + { + "epoch": 0.06367583212735166, + "grad_norm": 0.8920337557792664, + "learning_rate": 1.079136690647482e-06, + "loss": 1.5841, + "step": 44 + }, + { + "epoch": 0.06512301013024602, + "grad_norm": 0.8182032108306885, + "learning_rate": 1.115107913669065e-06, + "loss": 1.5724, + "step": 45 + }, + { + "epoch": 0.06657018813314038, + "grad_norm": 0.7506604194641113, + "learning_rate": 1.1510791366906476e-06, + "loss": 1.5513, + "step": 46 + }, + { + "epoch": 0.06801736613603473, + "grad_norm": 0.7997567057609558, + "learning_rate": 1.1870503597122303e-06, + "loss": 1.574, + "step": 47 + }, + { + "epoch": 0.0694645441389291, + "grad_norm": 0.8706564903259277, + "learning_rate": 1.2230215827338131e-06, + "loss": 1.5593, + "step": 48 + }, + { + "epoch": 0.07091172214182344, + "grad_norm": 0.8254820108413696, + "learning_rate": 1.2589928057553958e-06, + "loss": 1.5467, + "step": 49 + }, + { + "epoch": 0.0723589001447178, + "grad_norm": 0.8404789566993713, + "learning_rate": 1.2949640287769785e-06, + "loss": 1.5185, + "step": 50 + }, + { + "epoch": 0.0723589001447178, + "eval_loss": 1.5468790531158447, + "eval_runtime": 23.8115, + "eval_samples_per_second": 41.996, + "eval_steps_per_second": 2.646, + "step": 50 + }, + { + "epoch": 0.07380607814761216, + "grad_norm": 0.7575317621231079, + "learning_rate": 1.3309352517985614e-06, + "loss": 1.505, + "step": 51 + }, + { + "epoch": 0.07525325615050651, + "grad_norm": 1.001294732093811, + "learning_rate": 1.366906474820144e-06, + "loss": 1.5236, + "step": 52 + }, + { + "epoch": 0.07670043415340087, + "grad_norm": 0.9547874927520752, + "learning_rate": 1.4028776978417265e-06, + "loss": 1.5074, + "step": 53 + }, + { + "epoch": 0.07814761215629522, + "grad_norm": 1.2945491075515747, + "learning_rate": 1.4388489208633094e-06, + "loss": 1.51, + "step": 54 + }, + { + "epoch": 0.07959479015918958, + "grad_norm": 1.208261251449585, + "learning_rate": 1.474820143884892e-06, + "loss": 1.5254, + "step": 55 + }, + { + "epoch": 0.08104196816208394, + "grad_norm": 0.9851283431053162, + "learning_rate": 1.510791366906475e-06, + "loss": 1.4862, + "step": 56 + }, + { + "epoch": 0.0824891461649783, + "grad_norm": 1.0869357585906982, + "learning_rate": 1.5467625899280579e-06, + "loss": 1.4324, + "step": 57 + }, + { + "epoch": 0.08393632416787265, + "grad_norm": 1.1113526821136475, + "learning_rate": 1.5827338129496403e-06, + "loss": 1.4778, + "step": 58 + }, + { + "epoch": 0.085383502170767, + "grad_norm": 1.1976131200790405, + "learning_rate": 1.618705035971223e-06, + "loss": 1.4935, + "step": 59 + }, + { + "epoch": 0.08683068017366136, + "grad_norm": 1.3426886796951294, + "learning_rate": 1.654676258992806e-06, + "loss": 1.4379, + "step": 60 + }, + { + "epoch": 0.08683068017366136, + "eval_loss": 1.4807368516921997, + "eval_runtime": 23.753, + "eval_samples_per_second": 42.1, + "eval_steps_per_second": 2.652, + "step": 60 + }, + { + "epoch": 0.08827785817655572, + "grad_norm": 1.2697292566299438, + "learning_rate": 1.6906474820143886e-06, + "loss": 1.4738, + "step": 61 + }, + { + "epoch": 0.08972503617945007, + "grad_norm": 1.2183927297592163, + "learning_rate": 1.7266187050359715e-06, + "loss": 1.4749, + "step": 62 + }, + { + "epoch": 0.09117221418234443, + "grad_norm": 1.1829055547714233, + "learning_rate": 1.762589928057554e-06, + "loss": 1.4437, + "step": 63 + }, + { + "epoch": 0.09261939218523878, + "grad_norm": 1.189188838005066, + "learning_rate": 1.7985611510791368e-06, + "loss": 1.432, + "step": 64 + }, + { + "epoch": 0.09406657018813314, + "grad_norm": 1.1934256553649902, + "learning_rate": 1.8345323741007195e-06, + "loss": 1.4253, + "step": 65 + }, + { + "epoch": 0.0955137481910275, + "grad_norm": 1.167486310005188, + "learning_rate": 1.8705035971223024e-06, + "loss": 1.4189, + "step": 66 + }, + { + "epoch": 0.09696092619392185, + "grad_norm": 1.1545919179916382, + "learning_rate": 1.906474820143885e-06, + "loss": 1.3932, + "step": 67 + }, + { + "epoch": 0.09840810419681621, + "grad_norm": 1.233695387840271, + "learning_rate": 1.942446043165468e-06, + "loss": 1.3861, + "step": 68 + }, + { + "epoch": 0.09985528219971057, + "grad_norm": 1.1391371488571167, + "learning_rate": 1.9784172661870504e-06, + "loss": 1.4085, + "step": 69 + }, + { + "epoch": 0.10130246020260492, + "grad_norm": 1.0821561813354492, + "learning_rate": 2.0143884892086333e-06, + "loss": 1.3939, + "step": 70 + }, + { + "epoch": 0.10130246020260492, + "eval_loss": 1.4103246927261353, + "eval_runtime": 23.8212, + "eval_samples_per_second": 41.979, + "eval_steps_per_second": 2.645, + "step": 70 + }, + { + "epoch": 0.10274963820549927, + "grad_norm": 1.4297590255737305, + "learning_rate": 2.050359712230216e-06, + "loss": 1.4063, + "step": 71 + }, + { + "epoch": 0.10419681620839363, + "grad_norm": 1.3051462173461914, + "learning_rate": 2.0863309352517987e-06, + "loss": 1.3462, + "step": 72 + }, + { + "epoch": 0.10564399421128799, + "grad_norm": 1.2220996618270874, + "learning_rate": 2.1223021582733816e-06, + "loss": 1.3514, + "step": 73 + }, + { + "epoch": 0.10709117221418235, + "grad_norm": 1.1698566675186157, + "learning_rate": 2.158273381294964e-06, + "loss": 1.3345, + "step": 74 + }, + { + "epoch": 0.1085383502170767, + "grad_norm": 1.2173532247543335, + "learning_rate": 2.194244604316547e-06, + "loss": 1.3085, + "step": 75 + }, + { + "epoch": 0.10998552821997105, + "grad_norm": 1.0702295303344727, + "learning_rate": 2.23021582733813e-06, + "loss": 1.3473, + "step": 76 + }, + { + "epoch": 0.11143270622286541, + "grad_norm": 1.09895658493042, + "learning_rate": 2.2661870503597123e-06, + "loss": 1.3501, + "step": 77 + }, + { + "epoch": 0.11287988422575977, + "grad_norm": 1.128296136856079, + "learning_rate": 2.302158273381295e-06, + "loss": 1.3528, + "step": 78 + }, + { + "epoch": 0.11432706222865413, + "grad_norm": 1.043407917022705, + "learning_rate": 2.3381294964028776e-06, + "loss": 1.3496, + "step": 79 + }, + { + "epoch": 0.11577424023154848, + "grad_norm": 1.294937014579773, + "learning_rate": 2.3741007194244605e-06, + "loss": 1.3229, + "step": 80 + }, + { + "epoch": 0.11577424023154848, + "eval_loss": 1.3516587018966675, + "eval_runtime": 23.8211, + "eval_samples_per_second": 41.98, + "eval_steps_per_second": 2.645, + "step": 80 + }, + { + "epoch": 0.11722141823444283, + "grad_norm": 1.118807077407837, + "learning_rate": 2.4100719424460434e-06, + "loss": 1.281, + "step": 81 + }, + { + "epoch": 0.11866859623733719, + "grad_norm": 1.0621942281723022, + "learning_rate": 2.4460431654676263e-06, + "loss": 1.293, + "step": 82 + }, + { + "epoch": 0.12011577424023155, + "grad_norm": 1.1248623132705688, + "learning_rate": 2.4820143884892088e-06, + "loss": 1.3028, + "step": 83 + }, + { + "epoch": 0.12156295224312591, + "grad_norm": 1.1106863021850586, + "learning_rate": 2.5179856115107916e-06, + "loss": 1.2948, + "step": 84 + }, + { + "epoch": 0.12301013024602026, + "grad_norm": 1.1541526317596436, + "learning_rate": 2.5539568345323745e-06, + "loss": 1.2529, + "step": 85 + }, + { + "epoch": 0.12445730824891461, + "grad_norm": 1.1443675756454468, + "learning_rate": 2.589928057553957e-06, + "loss": 1.2885, + "step": 86 + }, + { + "epoch": 0.12590448625180897, + "grad_norm": 1.1611272096633911, + "learning_rate": 2.6258992805755395e-06, + "loss": 1.2416, + "step": 87 + }, + { + "epoch": 0.12735166425470332, + "grad_norm": 1.2930173873901367, + "learning_rate": 2.6618705035971228e-06, + "loss": 1.2699, + "step": 88 + }, + { + "epoch": 0.1287988422575977, + "grad_norm": 1.0100473165512085, + "learning_rate": 2.6978417266187052e-06, + "loss": 1.2929, + "step": 89 + }, + { + "epoch": 0.13024602026049203, + "grad_norm": 1.0842128992080688, + "learning_rate": 2.733812949640288e-06, + "loss": 1.2713, + "step": 90 + }, + { + "epoch": 0.13024602026049203, + "eval_loss": 1.31693696975708, + "eval_runtime": 23.816, + "eval_samples_per_second": 41.989, + "eval_steps_per_second": 2.645, + "step": 90 + }, + { + "epoch": 0.1316931982633864, + "grad_norm": 1.3509396314620972, + "learning_rate": 2.7697841726618706e-06, + "loss": 1.2864, + "step": 91 + }, + { + "epoch": 0.13314037626628075, + "grad_norm": 1.0161012411117554, + "learning_rate": 2.805755395683453e-06, + "loss": 1.2936, + "step": 92 + }, + { + "epoch": 0.1345875542691751, + "grad_norm": 1.2537583112716675, + "learning_rate": 2.8417266187050364e-06, + "loss": 1.2382, + "step": 93 + }, + { + "epoch": 0.13603473227206947, + "grad_norm": 1.2590771913528442, + "learning_rate": 2.877697841726619e-06, + "loss": 1.2587, + "step": 94 + }, + { + "epoch": 0.13748191027496381, + "grad_norm": 1.07223641872406, + "learning_rate": 2.9136690647482017e-06, + "loss": 1.2585, + "step": 95 + }, + { + "epoch": 0.1389290882778582, + "grad_norm": 1.1650582551956177, + "learning_rate": 2.949640287769784e-06, + "loss": 1.2333, + "step": 96 + }, + { + "epoch": 0.14037626628075253, + "grad_norm": 1.2447892427444458, + "learning_rate": 2.985611510791367e-06, + "loss": 1.2494, + "step": 97 + }, + { + "epoch": 0.14182344428364688, + "grad_norm": 1.1062458753585815, + "learning_rate": 3.02158273381295e-06, + "loss": 1.2253, + "step": 98 + }, + { + "epoch": 0.14327062228654125, + "grad_norm": 1.1102787256240845, + "learning_rate": 3.0575539568345324e-06, + "loss": 1.2692, + "step": 99 + }, + { + "epoch": 0.1447178002894356, + "grad_norm": 1.225022792816162, + "learning_rate": 3.0935251798561158e-06, + "loss": 1.2748, + "step": 100 + }, + { + "epoch": 0.1447178002894356, + "eval_loss": 1.2890573740005493, + "eval_runtime": 23.8766, + "eval_samples_per_second": 41.882, + "eval_steps_per_second": 2.639, + "step": 100 + }, + { + "epoch": 0.14616497829232997, + "grad_norm": 0.9993003010749817, + "learning_rate": 3.1294964028776982e-06, + "loss": 1.2206, + "step": 101 + }, + { + "epoch": 0.1476121562952243, + "grad_norm": 1.3005403280258179, + "learning_rate": 3.1654676258992807e-06, + "loss": 1.2369, + "step": 102 + }, + { + "epoch": 0.14905933429811866, + "grad_norm": 1.1737979650497437, + "learning_rate": 3.2014388489208636e-06, + "loss": 1.2027, + "step": 103 + }, + { + "epoch": 0.15050651230101303, + "grad_norm": 0.9585458040237427, + "learning_rate": 3.237410071942446e-06, + "loss": 1.2266, + "step": 104 + }, + { + "epoch": 0.15195369030390737, + "grad_norm": 1.2383979558944702, + "learning_rate": 3.2733812949640294e-06, + "loss": 1.2176, + "step": 105 + }, + { + "epoch": 0.15340086830680175, + "grad_norm": 1.3115565776824951, + "learning_rate": 3.309352517985612e-06, + "loss": 1.2053, + "step": 106 + }, + { + "epoch": 0.1548480463096961, + "grad_norm": 1.0333821773529053, + "learning_rate": 3.3453237410071943e-06, + "loss": 1.2167, + "step": 107 + }, + { + "epoch": 0.15629522431259044, + "grad_norm": 1.2587051391601562, + "learning_rate": 3.381294964028777e-06, + "loss": 1.2075, + "step": 108 + }, + { + "epoch": 0.1577424023154848, + "grad_norm": 1.0813391208648682, + "learning_rate": 3.4172661870503596e-06, + "loss": 1.2166, + "step": 109 + }, + { + "epoch": 0.15918958031837915, + "grad_norm": 1.2375551462173462, + "learning_rate": 3.453237410071943e-06, + "loss": 1.1718, + "step": 110 + }, + { + "epoch": 0.15918958031837915, + "eval_loss": 1.2689507007598877, + "eval_runtime": 23.685, + "eval_samples_per_second": 42.221, + "eval_steps_per_second": 2.66, + "step": 110 + }, + { + "epoch": 0.16063675832127353, + "grad_norm": 1.4196252822875977, + "learning_rate": 3.4892086330935254e-06, + "loss": 1.2371, + "step": 111 + }, + { + "epoch": 0.16208393632416787, + "grad_norm": 1.3492250442504883, + "learning_rate": 3.525179856115108e-06, + "loss": 1.236, + "step": 112 + }, + { + "epoch": 0.16353111432706222, + "grad_norm": 1.6325008869171143, + "learning_rate": 3.561151079136691e-06, + "loss": 1.2143, + "step": 113 + }, + { + "epoch": 0.1649782923299566, + "grad_norm": 1.3770588636398315, + "learning_rate": 3.5971223021582737e-06, + "loss": 1.2054, + "step": 114 + }, + { + "epoch": 0.16642547033285093, + "grad_norm": 1.399641990661621, + "learning_rate": 3.6330935251798566e-06, + "loss": 1.1971, + "step": 115 + }, + { + "epoch": 0.1678726483357453, + "grad_norm": 1.4306966066360474, + "learning_rate": 3.669064748201439e-06, + "loss": 1.1692, + "step": 116 + }, + { + "epoch": 0.16931982633863965, + "grad_norm": 1.5498414039611816, + "learning_rate": 3.7050359712230215e-06, + "loss": 1.2157, + "step": 117 + }, + { + "epoch": 0.170767004341534, + "grad_norm": 1.4300094842910767, + "learning_rate": 3.741007194244605e-06, + "loss": 1.1834, + "step": 118 + }, + { + "epoch": 0.17221418234442837, + "grad_norm": 1.3282134532928467, + "learning_rate": 3.7769784172661873e-06, + "loss": 1.2023, + "step": 119 + }, + { + "epoch": 0.1736613603473227, + "grad_norm": 1.359910011291504, + "learning_rate": 3.81294964028777e-06, + "loss": 1.187, + "step": 120 + }, + { + "epoch": 0.1736613603473227, + "eval_loss": 1.2511125802993774, + "eval_runtime": 23.7396, + "eval_samples_per_second": 42.124, + "eval_steps_per_second": 2.654, + "step": 120 + }, + { + "epoch": 0.17510853835021709, + "grad_norm": 1.0847463607788086, + "learning_rate": 3.848920863309353e-06, + "loss": 1.1777, + "step": 121 + }, + { + "epoch": 0.17655571635311143, + "grad_norm": 1.3984366655349731, + "learning_rate": 3.884892086330936e-06, + "loss": 1.1902, + "step": 122 + }, + { + "epoch": 0.17800289435600578, + "grad_norm": 1.1357507705688477, + "learning_rate": 3.920863309352518e-06, + "loss": 1.1577, + "step": 123 + }, + { + "epoch": 0.17945007235890015, + "grad_norm": 1.268511176109314, + "learning_rate": 3.956834532374101e-06, + "loss": 1.1803, + "step": 124 + }, + { + "epoch": 0.1808972503617945, + "grad_norm": 1.2954503297805786, + "learning_rate": 3.992805755395684e-06, + "loss": 1.2001, + "step": 125 + }, + { + "epoch": 0.18234442836468887, + "grad_norm": 1.3166769742965698, + "learning_rate": 4.028776978417267e-06, + "loss": 1.2016, + "step": 126 + }, + { + "epoch": 0.1837916063675832, + "grad_norm": 1.3556228876113892, + "learning_rate": 4.0647482014388495e-06, + "loss": 1.2004, + "step": 127 + }, + { + "epoch": 0.18523878437047755, + "grad_norm": 1.3515830039978027, + "learning_rate": 4.100719424460432e-06, + "loss": 1.1756, + "step": 128 + }, + { + "epoch": 0.18668596237337193, + "grad_norm": 1.4013315439224243, + "learning_rate": 4.1366906474820145e-06, + "loss": 1.181, + "step": 129 + }, + { + "epoch": 0.18813314037626627, + "grad_norm": 1.397181510925293, + "learning_rate": 4.172661870503597e-06, + "loss": 1.1739, + "step": 130 + }, + { + "epoch": 0.18813314037626627, + "eval_loss": 1.2341892719268799, + "eval_runtime": 23.7266, + "eval_samples_per_second": 42.147, + "eval_steps_per_second": 2.655, + "step": 130 + }, + { + "epoch": 0.18958031837916064, + "grad_norm": 1.2460709810256958, + "learning_rate": 4.20863309352518e-06, + "loss": 1.2026, + "step": 131 + }, + { + "epoch": 0.191027496382055, + "grad_norm": 1.1808323860168457, + "learning_rate": 4.244604316546763e-06, + "loss": 1.1644, + "step": 132 + }, + { + "epoch": 0.19247467438494936, + "grad_norm": 1.8333810567855835, + "learning_rate": 4.280575539568346e-06, + "loss": 1.2147, + "step": 133 + }, + { + "epoch": 0.1939218523878437, + "grad_norm": 1.2003754377365112, + "learning_rate": 4.316546762589928e-06, + "loss": 1.1991, + "step": 134 + }, + { + "epoch": 0.19536903039073805, + "grad_norm": 1.497824788093567, + "learning_rate": 4.352517985611511e-06, + "loss": 1.1799, + "step": 135 + }, + { + "epoch": 0.19681620839363242, + "grad_norm": 1.2876054048538208, + "learning_rate": 4.388489208633094e-06, + "loss": 1.1647, + "step": 136 + }, + { + "epoch": 0.19826338639652677, + "grad_norm": 1.514689564704895, + "learning_rate": 4.424460431654677e-06, + "loss": 1.1541, + "step": 137 + }, + { + "epoch": 0.19971056439942114, + "grad_norm": 1.2428678274154663, + "learning_rate": 4.46043165467626e-06, + "loss": 1.1365, + "step": 138 + }, + { + "epoch": 0.2011577424023155, + "grad_norm": 1.448983073234558, + "learning_rate": 4.496402877697842e-06, + "loss": 1.176, + "step": 139 + }, + { + "epoch": 0.20260492040520983, + "grad_norm": 1.2419850826263428, + "learning_rate": 4.5323741007194245e-06, + "loss": 1.1666, + "step": 140 + }, + { + "epoch": 0.20260492040520983, + "eval_loss": 1.2239601612091064, + "eval_runtime": 23.7675, + "eval_samples_per_second": 42.074, + "eval_steps_per_second": 2.651, + "step": 140 + }, + { + "epoch": 0.2040520984081042, + "grad_norm": 1.2841014862060547, + "learning_rate": 4.5683453237410074e-06, + "loss": 1.1375, + "step": 141 + }, + { + "epoch": 0.20549927641099855, + "grad_norm": 1.2060447931289673, + "learning_rate": 4.60431654676259e-06, + "loss": 1.1578, + "step": 142 + }, + { + "epoch": 0.20694645441389292, + "grad_norm": 1.4487413167953491, + "learning_rate": 4.640287769784173e-06, + "loss": 1.1589, + "step": 143 + }, + { + "epoch": 0.20839363241678727, + "grad_norm": 1.3420909643173218, + "learning_rate": 4.676258992805755e-06, + "loss": 1.1463, + "step": 144 + }, + { + "epoch": 0.2098408104196816, + "grad_norm": 1.2916061878204346, + "learning_rate": 4.712230215827339e-06, + "loss": 1.1705, + "step": 145 + }, + { + "epoch": 0.21128798842257598, + "grad_norm": 1.3262569904327393, + "learning_rate": 4.748201438848921e-06, + "loss": 1.158, + "step": 146 + }, + { + "epoch": 0.21273516642547033, + "grad_norm": 1.5340204238891602, + "learning_rate": 4.784172661870504e-06, + "loss": 1.1118, + "step": 147 + }, + { + "epoch": 0.2141823444283647, + "grad_norm": 1.5629545450210571, + "learning_rate": 4.820143884892087e-06, + "loss": 1.1641, + "step": 148 + }, + { + "epoch": 0.21562952243125905, + "grad_norm": 1.1279692649841309, + "learning_rate": 4.856115107913669e-06, + "loss": 1.1477, + "step": 149 + }, + { + "epoch": 0.2170767004341534, + "grad_norm": 1.3184373378753662, + "learning_rate": 4.892086330935253e-06, + "loss": 1.1667, + "step": 150 + }, + { + "epoch": 0.2170767004341534, + "eval_loss": 1.210039496421814, + "eval_runtime": 23.7509, + "eval_samples_per_second": 42.104, + "eval_steps_per_second": 2.653, + "step": 150 + }, + { + "epoch": 0.21852387843704776, + "grad_norm": 1.189042091369629, + "learning_rate": 4.928057553956835e-06, + "loss": 1.156, + "step": 151 + }, + { + "epoch": 0.2199710564399421, + "grad_norm": 1.3684828281402588, + "learning_rate": 4.9640287769784175e-06, + "loss": 1.1006, + "step": 152 + }, + { + "epoch": 0.22141823444283648, + "grad_norm": 1.2458510398864746, + "learning_rate": 5e-06, + "loss": 1.1249, + "step": 153 + }, + { + "epoch": 0.22286541244573083, + "grad_norm": 1.3158795833587646, + "learning_rate": 4.9959774738535805e-06, + "loss": 1.1405, + "step": 154 + }, + { + "epoch": 0.22431259044862517, + "grad_norm": 1.3048667907714844, + "learning_rate": 4.991954947707161e-06, + "loss": 1.1185, + "step": 155 + }, + { + "epoch": 0.22575976845151954, + "grad_norm": 1.3530330657958984, + "learning_rate": 4.987932421560741e-06, + "loss": 1.1554, + "step": 156 + }, + { + "epoch": 0.2272069464544139, + "grad_norm": 1.1434491872787476, + "learning_rate": 4.983909895414321e-06, + "loss": 1.1557, + "step": 157 + }, + { + "epoch": 0.22865412445730826, + "grad_norm": 1.2167620658874512, + "learning_rate": 4.9798873692679e-06, + "loss": 1.099, + "step": 158 + }, + { + "epoch": 0.2301013024602026, + "grad_norm": 1.4193692207336426, + "learning_rate": 4.97586484312148e-06, + "loss": 1.1281, + "step": 159 + }, + { + "epoch": 0.23154848046309695, + "grad_norm": 1.342673420906067, + "learning_rate": 4.9718423169750605e-06, + "loss": 1.1605, + "step": 160 + }, + { + "epoch": 0.23154848046309695, + "eval_loss": 1.2034168243408203, + "eval_runtime": 23.6534, + "eval_samples_per_second": 42.277, + "eval_steps_per_second": 2.663, + "step": 160 + }, + { + "epoch": 0.23299565846599132, + "grad_norm": 1.1687936782836914, + "learning_rate": 4.967819790828641e-06, + "loss": 1.1303, + "step": 161 + }, + { + "epoch": 0.23444283646888567, + "grad_norm": 1.6776436567306519, + "learning_rate": 4.963797264682221e-06, + "loss": 1.0999, + "step": 162 + }, + { + "epoch": 0.23589001447178004, + "grad_norm": 1.4628556966781616, + "learning_rate": 4.959774738535801e-06, + "loss": 1.1251, + "step": 163 + }, + { + "epoch": 0.23733719247467439, + "grad_norm": 1.5022996664047241, + "learning_rate": 4.955752212389381e-06, + "loss": 1.1313, + "step": 164 + }, + { + "epoch": 0.23878437047756873, + "grad_norm": 1.4081605672836304, + "learning_rate": 4.951729686242961e-06, + "loss": 1.086, + "step": 165 + }, + { + "epoch": 0.2402315484804631, + "grad_norm": 1.4371721744537354, + "learning_rate": 4.947707160096541e-06, + "loss": 1.1371, + "step": 166 + }, + { + "epoch": 0.24167872648335745, + "grad_norm": 1.4459543228149414, + "learning_rate": 4.943684633950121e-06, + "loss": 1.1065, + "step": 167 + }, + { + "epoch": 0.24312590448625182, + "grad_norm": 1.4168459177017212, + "learning_rate": 4.939662107803701e-06, + "loss": 1.132, + "step": 168 + }, + { + "epoch": 0.24457308248914617, + "grad_norm": 1.166857123374939, + "learning_rate": 4.935639581657281e-06, + "loss": 1.0984, + "step": 169 + }, + { + "epoch": 0.2460202604920405, + "grad_norm": 1.256256341934204, + "learning_rate": 4.931617055510861e-06, + "loss": 1.1402, + "step": 170 + }, + { + "epoch": 0.2460202604920405, + "eval_loss": 1.1935114860534668, + "eval_runtime": 23.7705, + "eval_samples_per_second": 42.069, + "eval_steps_per_second": 2.65, + "step": 170 + }, + { + "epoch": 0.24746743849493488, + "grad_norm": 1.4134575128555298, + "learning_rate": 4.927594529364441e-06, + "loss": 1.1455, + "step": 171 + }, + { + "epoch": 0.24891461649782923, + "grad_norm": 1.2584257125854492, + "learning_rate": 4.923572003218021e-06, + "loss": 1.1159, + "step": 172 + }, + { + "epoch": 0.2503617945007236, + "grad_norm": 1.1756311655044556, + "learning_rate": 4.919549477071601e-06, + "loss": 1.0872, + "step": 173 + }, + { + "epoch": 0.25180897250361794, + "grad_norm": 1.2628989219665527, + "learning_rate": 4.915526950925181e-06, + "loss": 1.1047, + "step": 174 + }, + { + "epoch": 0.2532561505065123, + "grad_norm": 1.250351905822754, + "learning_rate": 4.9115044247787615e-06, + "loss": 1.1189, + "step": 175 + }, + { + "epoch": 0.25470332850940663, + "grad_norm": 1.3086153268814087, + "learning_rate": 4.907481898632342e-06, + "loss": 1.098, + "step": 176 + }, + { + "epoch": 0.25615050651230103, + "grad_norm": 1.1292258501052856, + "learning_rate": 4.903459372485922e-06, + "loss": 1.1536, + "step": 177 + }, + { + "epoch": 0.2575976845151954, + "grad_norm": 1.1963332891464233, + "learning_rate": 4.899436846339501e-06, + "loss": 1.1016, + "step": 178 + }, + { + "epoch": 0.2590448625180897, + "grad_norm": 1.315817952156067, + "learning_rate": 4.895414320193081e-06, + "loss": 1.109, + "step": 179 + }, + { + "epoch": 0.26049204052098407, + "grad_norm": 1.338917851448059, + "learning_rate": 4.891391794046661e-06, + "loss": 1.126, + "step": 180 + }, + { + "epoch": 0.26049204052098407, + "eval_loss": 1.1867763996124268, + "eval_runtime": 23.7561, + "eval_samples_per_second": 42.094, + "eval_steps_per_second": 2.652, + "step": 180 + }, + { + "epoch": 0.2619392185238784, + "grad_norm": 1.607206106185913, + "learning_rate": 4.8873692679002414e-06, + "loss": 1.1327, + "step": 181 + }, + { + "epoch": 0.2633863965267728, + "grad_norm": 1.2567116022109985, + "learning_rate": 4.8833467417538216e-06, + "loss": 1.1111, + "step": 182 + }, + { + "epoch": 0.26483357452966716, + "grad_norm": 1.5325278043746948, + "learning_rate": 4.879324215607402e-06, + "loss": 1.1234, + "step": 183 + }, + { + "epoch": 0.2662807525325615, + "grad_norm": 1.313176155090332, + "learning_rate": 4.875301689460982e-06, + "loss": 1.0929, + "step": 184 + }, + { + "epoch": 0.26772793053545585, + "grad_norm": 1.4401973485946655, + "learning_rate": 4.871279163314562e-06, + "loss": 1.1257, + "step": 185 + }, + { + "epoch": 0.2691751085383502, + "grad_norm": 1.4455851316452026, + "learning_rate": 4.867256637168142e-06, + "loss": 1.1069, + "step": 186 + }, + { + "epoch": 0.2706222865412446, + "grad_norm": 1.4538307189941406, + "learning_rate": 4.863234111021722e-06, + "loss": 1.0882, + "step": 187 + }, + { + "epoch": 0.27206946454413894, + "grad_norm": 1.5298668146133423, + "learning_rate": 4.8592115848753015e-06, + "loss": 1.1007, + "step": 188 + }, + { + "epoch": 0.2735166425470333, + "grad_norm": 1.577487826347351, + "learning_rate": 4.855189058728882e-06, + "loss": 1.1062, + "step": 189 + }, + { + "epoch": 0.27496382054992763, + "grad_norm": 1.3045742511749268, + "learning_rate": 4.851166532582462e-06, + "loss": 1.0865, + "step": 190 + }, + { + "epoch": 0.27496382054992763, + "eval_loss": 1.180038571357727, + "eval_runtime": 23.7442, + "eval_samples_per_second": 42.116, + "eval_steps_per_second": 2.653, + "step": 190 + }, + { + "epoch": 0.276410998552822, + "grad_norm": 1.6419295072555542, + "learning_rate": 4.847144006436042e-06, + "loss": 1.1279, + "step": 191 + }, + { + "epoch": 0.2778581765557164, + "grad_norm": 1.2489582300186157, + "learning_rate": 4.843121480289622e-06, + "loss": 1.0574, + "step": 192 + }, + { + "epoch": 0.2793053545586107, + "grad_norm": 1.3978028297424316, + "learning_rate": 4.839098954143202e-06, + "loss": 1.0683, + "step": 193 + }, + { + "epoch": 0.28075253256150506, + "grad_norm": 1.3827496767044067, + "learning_rate": 4.835076427996782e-06, + "loss": 1.0885, + "step": 194 + }, + { + "epoch": 0.2821997105643994, + "grad_norm": 1.1835651397705078, + "learning_rate": 4.831053901850362e-06, + "loss": 1.1322, + "step": 195 + }, + { + "epoch": 0.28364688856729375, + "grad_norm": 1.428334355354309, + "learning_rate": 4.8270313757039425e-06, + "loss": 1.1025, + "step": 196 + }, + { + "epoch": 0.28509406657018815, + "grad_norm": 1.187781810760498, + "learning_rate": 4.823008849557523e-06, + "loss": 1.0816, + "step": 197 + }, + { + "epoch": 0.2865412445730825, + "grad_norm": 1.344883680343628, + "learning_rate": 4.818986323411103e-06, + "loss": 1.0836, + "step": 198 + }, + { + "epoch": 0.28798842257597684, + "grad_norm": 1.6049326658248901, + "learning_rate": 4.814963797264683e-06, + "loss": 1.0859, + "step": 199 + }, + { + "epoch": 0.2894356005788712, + "grad_norm": 1.3415390253067017, + "learning_rate": 4.810941271118263e-06, + "loss": 1.076, + "step": 200 + }, + { + "epoch": 0.2894356005788712, + "eval_loss": 1.1744095087051392, + "eval_runtime": 23.7425, + "eval_samples_per_second": 42.119, + "eval_steps_per_second": 2.653, + "step": 200 + }, + { + "epoch": 0.29088277858176553, + "grad_norm": 1.406237244606018, + "learning_rate": 4.806918744971843e-06, + "loss": 1.0796, + "step": 201 + }, + { + "epoch": 0.29232995658465993, + "grad_norm": 1.1777249574661255, + "learning_rate": 4.802896218825423e-06, + "loss": 1.0837, + "step": 202 + }, + { + "epoch": 0.2937771345875543, + "grad_norm": 1.2819619178771973, + "learning_rate": 4.798873692679003e-06, + "loss": 1.1158, + "step": 203 + }, + { + "epoch": 0.2952243125904486, + "grad_norm": 1.2678483724594116, + "learning_rate": 4.794851166532583e-06, + "loss": 1.1561, + "step": 204 + }, + { + "epoch": 0.29667149059334297, + "grad_norm": 1.2497974634170532, + "learning_rate": 4.790828640386163e-06, + "loss": 1.132, + "step": 205 + }, + { + "epoch": 0.2981186685962373, + "grad_norm": 1.3484586477279663, + "learning_rate": 4.786806114239743e-06, + "loss": 1.0981, + "step": 206 + }, + { + "epoch": 0.2995658465991317, + "grad_norm": 1.3319278955459595, + "learning_rate": 4.782783588093323e-06, + "loss": 1.1202, + "step": 207 + }, + { + "epoch": 0.30101302460202606, + "grad_norm": 1.3807053565979004, + "learning_rate": 4.778761061946903e-06, + "loss": 1.0639, + "step": 208 + }, + { + "epoch": 0.3024602026049204, + "grad_norm": 1.6423574686050415, + "learning_rate": 4.774738535800483e-06, + "loss": 1.099, + "step": 209 + }, + { + "epoch": 0.30390738060781475, + "grad_norm": 1.2813725471496582, + "learning_rate": 4.7707160096540635e-06, + "loss": 1.0722, + "step": 210 + }, + { + "epoch": 0.30390738060781475, + "eval_loss": 1.1650046110153198, + "eval_runtime": 23.7271, + "eval_samples_per_second": 42.146, + "eval_steps_per_second": 2.655, + "step": 210 + }, + { + "epoch": 0.3053545586107091, + "grad_norm": 1.222902774810791, + "learning_rate": 4.766693483507644e-06, + "loss": 1.0926, + "step": 211 + }, + { + "epoch": 0.3068017366136035, + "grad_norm": 1.3555961847305298, + "learning_rate": 4.762670957361224e-06, + "loss": 1.1023, + "step": 212 + }, + { + "epoch": 0.30824891461649784, + "grad_norm": 1.2937288284301758, + "learning_rate": 4.758648431214804e-06, + "loss": 1.0815, + "step": 213 + }, + { + "epoch": 0.3096960926193922, + "grad_norm": 1.535508632659912, + "learning_rate": 4.754625905068383e-06, + "loss": 1.1259, + "step": 214 + }, + { + "epoch": 0.3111432706222865, + "grad_norm": 1.2665668725967407, + "learning_rate": 4.750603378921963e-06, + "loss": 1.0961, + "step": 215 + }, + { + "epoch": 0.3125904486251809, + "grad_norm": 1.4438718557357788, + "learning_rate": 4.746580852775543e-06, + "loss": 1.0958, + "step": 216 + }, + { + "epoch": 0.3140376266280753, + "grad_norm": 1.2452501058578491, + "learning_rate": 4.7425583266291235e-06, + "loss": 1.1098, + "step": 217 + }, + { + "epoch": 0.3154848046309696, + "grad_norm": 1.3459973335266113, + "learning_rate": 4.738535800482704e-06, + "loss": 1.0929, + "step": 218 + }, + { + "epoch": 0.31693198263386396, + "grad_norm": 1.4809298515319824, + "learning_rate": 4.734513274336284e-06, + "loss": 1.089, + "step": 219 + }, + { + "epoch": 0.3183791606367583, + "grad_norm": 1.2082918882369995, + "learning_rate": 4.730490748189864e-06, + "loss": 1.0658, + "step": 220 + }, + { + "epoch": 0.3183791606367583, + "eval_loss": 1.1600922346115112, + "eval_runtime": 23.8112, + "eval_samples_per_second": 41.997, + "eval_steps_per_second": 2.646, + "step": 220 + }, + { + "epoch": 0.31982633863965265, + "grad_norm": 1.3814849853515625, + "learning_rate": 4.726468222043444e-06, + "loss": 1.0736, + "step": 221 + }, + { + "epoch": 0.32127351664254705, + "grad_norm": 1.4855375289916992, + "learning_rate": 4.722445695897024e-06, + "loss": 1.0937, + "step": 222 + }, + { + "epoch": 0.3227206946454414, + "grad_norm": 1.3563365936279297, + "learning_rate": 4.718423169750604e-06, + "loss": 1.1111, + "step": 223 + }, + { + "epoch": 0.32416787264833574, + "grad_norm": 1.388593316078186, + "learning_rate": 4.7144006436041835e-06, + "loss": 1.0409, + "step": 224 + }, + { + "epoch": 0.3256150506512301, + "grad_norm": 1.2511292695999146, + "learning_rate": 4.710378117457764e-06, + "loss": 1.0407, + "step": 225 + }, + { + "epoch": 0.32706222865412443, + "grad_norm": 1.2867145538330078, + "learning_rate": 4.706355591311344e-06, + "loss": 1.069, + "step": 226 + }, + { + "epoch": 0.32850940665701883, + "grad_norm": 1.2984319925308228, + "learning_rate": 4.702333065164924e-06, + "loss": 1.075, + "step": 227 + }, + { + "epoch": 0.3299565846599132, + "grad_norm": 1.3587323427200317, + "learning_rate": 4.698310539018504e-06, + "loss": 1.0816, + "step": 228 + }, + { + "epoch": 0.3314037626628075, + "grad_norm": 1.3507968187332153, + "learning_rate": 4.694288012872084e-06, + "loss": 1.0526, + "step": 229 + }, + { + "epoch": 0.33285094066570187, + "grad_norm": 1.2984371185302734, + "learning_rate": 4.690265486725664e-06, + "loss": 1.0931, + "step": 230 + }, + { + "epoch": 0.33285094066570187, + "eval_loss": 1.1575669050216675, + "eval_runtime": 23.721, + "eval_samples_per_second": 42.157, + "eval_steps_per_second": 2.656, + "step": 230 + }, + { + "epoch": 0.3342981186685962, + "grad_norm": 1.4830766916275024, + "learning_rate": 4.6862429605792444e-06, + "loss": 1.0719, + "step": 231 + }, + { + "epoch": 0.3357452966714906, + "grad_norm": 1.3004618883132935, + "learning_rate": 4.6822204344328246e-06, + "loss": 1.0693, + "step": 232 + }, + { + "epoch": 0.33719247467438496, + "grad_norm": 1.710830807685852, + "learning_rate": 4.678197908286405e-06, + "loss": 1.0905, + "step": 233 + }, + { + "epoch": 0.3386396526772793, + "grad_norm": 1.1909985542297363, + "learning_rate": 4.674175382139984e-06, + "loss": 1.0817, + "step": 234 + }, + { + "epoch": 0.34008683068017365, + "grad_norm": 1.1803637742996216, + "learning_rate": 4.670152855993564e-06, + "loss": 1.1276, + "step": 235 + }, + { + "epoch": 0.341534008683068, + "grad_norm": 1.4538116455078125, + "learning_rate": 4.666130329847144e-06, + "loss": 1.0998, + "step": 236 + }, + { + "epoch": 0.3429811866859624, + "grad_norm": 1.263163685798645, + "learning_rate": 4.662107803700724e-06, + "loss": 1.0904, + "step": 237 + }, + { + "epoch": 0.34442836468885674, + "grad_norm": 1.5513380765914917, + "learning_rate": 4.6580852775543045e-06, + "loss": 1.106, + "step": 238 + }, + { + "epoch": 0.3458755426917511, + "grad_norm": 1.2663887739181519, + "learning_rate": 4.654062751407885e-06, + "loss": 1.0864, + "step": 239 + }, + { + "epoch": 0.3473227206946454, + "grad_norm": 1.3391340970993042, + "learning_rate": 4.650040225261465e-06, + "loss": 1.0603, + "step": 240 + }, + { + "epoch": 0.3473227206946454, + "eval_loss": 1.149273157119751, + "eval_runtime": 23.5467, + "eval_samples_per_second": 42.469, + "eval_steps_per_second": 2.676, + "step": 240 + }, + { + "epoch": 0.34876989869753977, + "grad_norm": 1.3855514526367188, + "learning_rate": 4.646017699115045e-06, + "loss": 1.0694, + "step": 241 + }, + { + "epoch": 0.35021707670043417, + "grad_norm": 1.4039252996444702, + "learning_rate": 4.641995172968625e-06, + "loss": 1.0381, + "step": 242 + }, + { + "epoch": 0.3516642547033285, + "grad_norm": 1.3421276807785034, + "learning_rate": 4.637972646822205e-06, + "loss": 1.0829, + "step": 243 + }, + { + "epoch": 0.35311143270622286, + "grad_norm": 1.5521053075790405, + "learning_rate": 4.633950120675784e-06, + "loss": 1.0786, + "step": 244 + }, + { + "epoch": 0.3545586107091172, + "grad_norm": 1.444671630859375, + "learning_rate": 4.6299275945293645e-06, + "loss": 1.0856, + "step": 245 + }, + { + "epoch": 0.35600578871201155, + "grad_norm": 1.412002444267273, + "learning_rate": 4.625905068382945e-06, + "loss": 1.0747, + "step": 246 + }, + { + "epoch": 0.35745296671490595, + "grad_norm": 1.3790333271026611, + "learning_rate": 4.621882542236525e-06, + "loss": 1.0803, + "step": 247 + }, + { + "epoch": 0.3589001447178003, + "grad_norm": 1.545931100845337, + "learning_rate": 4.617860016090105e-06, + "loss": 1.075, + "step": 248 + }, + { + "epoch": 0.36034732272069464, + "grad_norm": 1.469652771949768, + "learning_rate": 4.613837489943685e-06, + "loss": 1.0941, + "step": 249 + }, + { + "epoch": 0.361794500723589, + "grad_norm": 1.3562983274459839, + "learning_rate": 4.609814963797265e-06, + "loss": 1.0483, + "step": 250 + }, + { + "epoch": 0.361794500723589, + "eval_loss": 1.1459516286849976, + "eval_runtime": 23.6871, + "eval_samples_per_second": 42.217, + "eval_steps_per_second": 2.66, + "step": 250 + }, + { + "epoch": 0.36324167872648333, + "grad_norm": 1.5659211874008179, + "learning_rate": 4.605792437650845e-06, + "loss": 1.0808, + "step": 251 + }, + { + "epoch": 0.36468885672937773, + "grad_norm": 1.2934436798095703, + "learning_rate": 4.6017699115044254e-06, + "loss": 1.0607, + "step": 252 + }, + { + "epoch": 0.3661360347322721, + "grad_norm": 1.2306584119796753, + "learning_rate": 4.5977473853580056e-06, + "loss": 1.0463, + "step": 253 + }, + { + "epoch": 0.3675832127351664, + "grad_norm": 1.3910349607467651, + "learning_rate": 4.593724859211585e-06, + "loss": 1.0701, + "step": 254 + }, + { + "epoch": 0.36903039073806077, + "grad_norm": 1.1596152782440186, + "learning_rate": 4.589702333065165e-06, + "loss": 1.0798, + "step": 255 + }, + { + "epoch": 0.3704775687409551, + "grad_norm": 1.6285136938095093, + "learning_rate": 4.585679806918745e-06, + "loss": 1.0495, + "step": 256 + }, + { + "epoch": 0.3719247467438495, + "grad_norm": 1.2884211540222168, + "learning_rate": 4.581657280772325e-06, + "loss": 1.0623, + "step": 257 + }, + { + "epoch": 0.37337192474674386, + "grad_norm": 1.3367090225219727, + "learning_rate": 4.577634754625905e-06, + "loss": 1.0758, + "step": 258 + }, + { + "epoch": 0.3748191027496382, + "grad_norm": 1.4230573177337646, + "learning_rate": 4.5736122284794855e-06, + "loss": 1.0711, + "step": 259 + }, + { + "epoch": 0.37626628075253254, + "grad_norm": 1.2134445905685425, + "learning_rate": 4.569589702333066e-06, + "loss": 1.0818, + "step": 260 + }, + { + "epoch": 0.37626628075253254, + "eval_loss": 1.1419272422790527, + "eval_runtime": 23.7533, + "eval_samples_per_second": 42.099, + "eval_steps_per_second": 2.652, + "step": 260 + }, + { + "epoch": 0.37771345875542695, + "grad_norm": 1.6413689851760864, + "learning_rate": 4.565567176186646e-06, + "loss": 1.0309, + "step": 261 + }, + { + "epoch": 0.3791606367583213, + "grad_norm": 1.4252878427505493, + "learning_rate": 4.561544650040226e-06, + "loss": 1.0641, + "step": 262 + }, + { + "epoch": 0.38060781476121563, + "grad_norm": 1.4460115432739258, + "learning_rate": 4.557522123893805e-06, + "loss": 1.0797, + "step": 263 + }, + { + "epoch": 0.38205499276411, + "grad_norm": 1.4155033826828003, + "learning_rate": 4.553499597747385e-06, + "loss": 1.0965, + "step": 264 + }, + { + "epoch": 0.3835021707670043, + "grad_norm": 1.3723843097686768, + "learning_rate": 4.549477071600965e-06, + "loss": 1.0435, + "step": 265 + }, + { + "epoch": 0.3849493487698987, + "grad_norm": 1.3257044553756714, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.0598, + "step": 266 + }, + { + "epoch": 0.38639652677279307, + "grad_norm": 1.318723440170288, + "learning_rate": 4.541432019308126e-06, + "loss": 1.0608, + "step": 267 + }, + { + "epoch": 0.3878437047756874, + "grad_norm": 1.3927741050720215, + "learning_rate": 4.537409493161706e-06, + "loss": 1.0835, + "step": 268 + }, + { + "epoch": 0.38929088277858176, + "grad_norm": 1.3704252243041992, + "learning_rate": 4.533386967015286e-06, + "loss": 1.0536, + "step": 269 + }, + { + "epoch": 0.3907380607814761, + "grad_norm": 1.3910446166992188, + "learning_rate": 4.529364440868866e-06, + "loss": 1.0608, + "step": 270 + }, + { + "epoch": 0.3907380607814761, + "eval_loss": 1.1380563974380493, + "eval_runtime": 23.7104, + "eval_samples_per_second": 42.176, + "eval_steps_per_second": 2.657, + "step": 270 + }, + { + "epoch": 0.3921852387843705, + "grad_norm": 1.3025933504104614, + "learning_rate": 4.525341914722446e-06, + "loss": 1.0128, + "step": 271 + }, + { + "epoch": 0.39363241678726485, + "grad_norm": 1.362486481666565, + "learning_rate": 4.521319388576026e-06, + "loss": 1.1214, + "step": 272 + }, + { + "epoch": 0.3950795947901592, + "grad_norm": 1.3051025867462158, + "learning_rate": 4.5172968624296056e-06, + "loss": 1.0252, + "step": 273 + }, + { + "epoch": 0.39652677279305354, + "grad_norm": 1.4714802503585815, + "learning_rate": 4.513274336283186e-06, + "loss": 1.0622, + "step": 274 + }, + { + "epoch": 0.3979739507959479, + "grad_norm": 1.1817030906677246, + "learning_rate": 4.509251810136766e-06, + "loss": 1.0477, + "step": 275 + }, + { + "epoch": 0.3994211287988423, + "grad_norm": 1.2782890796661377, + "learning_rate": 4.505229283990346e-06, + "loss": 1.0654, + "step": 276 + }, + { + "epoch": 0.40086830680173663, + "grad_norm": 1.3380616903305054, + "learning_rate": 4.501206757843926e-06, + "loss": 1.0604, + "step": 277 + }, + { + "epoch": 0.402315484804631, + "grad_norm": 1.3800311088562012, + "learning_rate": 4.497184231697506e-06, + "loss": 1.0417, + "step": 278 + }, + { + "epoch": 0.4037626628075253, + "grad_norm": 1.208494782447815, + "learning_rate": 4.493161705551086e-06, + "loss": 1.0474, + "step": 279 + }, + { + "epoch": 0.40520984081041966, + "grad_norm": 1.2973201274871826, + "learning_rate": 4.4891391794046665e-06, + "loss": 1.0522, + "step": 280 + }, + { + "epoch": 0.40520984081041966, + "eval_loss": 1.1327248811721802, + "eval_runtime": 23.6965, + "eval_samples_per_second": 42.2, + "eval_steps_per_second": 2.659, + "step": 280 + }, + { + "epoch": 0.40665701881331406, + "grad_norm": 1.3032149076461792, + "learning_rate": 4.485116653258247e-06, + "loss": 1.0731, + "step": 281 + }, + { + "epoch": 0.4081041968162084, + "grad_norm": 1.3234710693359375, + "learning_rate": 4.481094127111827e-06, + "loss": 1.0935, + "step": 282 + }, + { + "epoch": 0.40955137481910275, + "grad_norm": 1.3118281364440918, + "learning_rate": 4.477071600965406e-06, + "loss": 1.0291, + "step": 283 + }, + { + "epoch": 0.4109985528219971, + "grad_norm": 1.3796930313110352, + "learning_rate": 4.473049074818986e-06, + "loss": 1.0659, + "step": 284 + }, + { + "epoch": 0.41244573082489144, + "grad_norm": 1.2467941045761108, + "learning_rate": 4.469026548672566e-06, + "loss": 1.0711, + "step": 285 + }, + { + "epoch": 0.41389290882778584, + "grad_norm": 1.333546757698059, + "learning_rate": 4.465004022526146e-06, + "loss": 1.0308, + "step": 286 + }, + { + "epoch": 0.4153400868306802, + "grad_norm": 1.3348644971847534, + "learning_rate": 4.460981496379727e-06, + "loss": 1.0375, + "step": 287 + }, + { + "epoch": 0.41678726483357453, + "grad_norm": 1.3992811441421509, + "learning_rate": 4.4569589702333075e-06, + "loss": 1.0439, + "step": 288 + }, + { + "epoch": 0.4182344428364689, + "grad_norm": 1.4405189752578735, + "learning_rate": 4.452936444086888e-06, + "loss": 1.0729, + "step": 289 + }, + { + "epoch": 0.4196816208393632, + "grad_norm": 1.3112566471099854, + "learning_rate": 4.448913917940467e-06, + "loss": 1.0788, + "step": 290 + }, + { + "epoch": 0.4196816208393632, + "eval_loss": 1.1283701658248901, + "eval_runtime": 23.6496, + "eval_samples_per_second": 42.284, + "eval_steps_per_second": 2.664, + "step": 290 + }, + { + "epoch": 0.4211287988422576, + "grad_norm": 1.3317224979400635, + "learning_rate": 4.444891391794047e-06, + "loss": 1.0429, + "step": 291 + }, + { + "epoch": 0.42257597684515197, + "grad_norm": 1.2637133598327637, + "learning_rate": 4.440868865647627e-06, + "loss": 1.0615, + "step": 292 + }, + { + "epoch": 0.4240231548480463, + "grad_norm": 1.3004966974258423, + "learning_rate": 4.436846339501207e-06, + "loss": 1.0474, + "step": 293 + }, + { + "epoch": 0.42547033285094066, + "grad_norm": 1.2178356647491455, + "learning_rate": 4.432823813354787e-06, + "loss": 1.0669, + "step": 294 + }, + { + "epoch": 0.426917510853835, + "grad_norm": 1.2395060062408447, + "learning_rate": 4.4288012872083675e-06, + "loss": 1.0249, + "step": 295 + }, + { + "epoch": 0.4283646888567294, + "grad_norm": 1.3513914346694946, + "learning_rate": 4.424778761061948e-06, + "loss": 1.0406, + "step": 296 + }, + { + "epoch": 0.42981186685962375, + "grad_norm": 1.293668270111084, + "learning_rate": 4.420756234915528e-06, + "loss": 1.0173, + "step": 297 + }, + { + "epoch": 0.4312590448625181, + "grad_norm": 1.2693403959274292, + "learning_rate": 4.416733708769108e-06, + "loss": 1.0326, + "step": 298 + }, + { + "epoch": 0.43270622286541244, + "grad_norm": 1.1913676261901855, + "learning_rate": 4.412711182622687e-06, + "loss": 1.074, + "step": 299 + }, + { + "epoch": 0.4341534008683068, + "grad_norm": 1.3073598146438599, + "learning_rate": 4.408688656476267e-06, + "loss": 1.0077, + "step": 300 + }, + { + "epoch": 0.4341534008683068, + "eval_loss": 1.1243594884872437, + "eval_runtime": 23.722, + "eval_samples_per_second": 42.155, + "eval_steps_per_second": 2.656, + "step": 300 + }, + { + "epoch": 0.4356005788712012, + "grad_norm": 1.2678052186965942, + "learning_rate": 4.4046661303298474e-06, + "loss": 1.0109, + "step": 301 + }, + { + "epoch": 0.4370477568740955, + "grad_norm": 1.4674625396728516, + "learning_rate": 4.4006436041834276e-06, + "loss": 1.0788, + "step": 302 + }, + { + "epoch": 0.4384949348769899, + "grad_norm": 1.2991254329681396, + "learning_rate": 4.396621078037008e-06, + "loss": 1.0235, + "step": 303 + }, + { + "epoch": 0.4399421128798842, + "grad_norm": 1.279126763343811, + "learning_rate": 4.392598551890588e-06, + "loss": 1.0419, + "step": 304 + }, + { + "epoch": 0.44138929088277856, + "grad_norm": 1.360550880432129, + "learning_rate": 4.388576025744168e-06, + "loss": 1.0265, + "step": 305 + }, + { + "epoch": 0.44283646888567296, + "grad_norm": 1.2458831071853638, + "learning_rate": 4.384553499597748e-06, + "loss": 1.0342, + "step": 306 + }, + { + "epoch": 0.4442836468885673, + "grad_norm": 1.312846064567566, + "learning_rate": 4.380530973451328e-06, + "loss": 1.0251, + "step": 307 + }, + { + "epoch": 0.44573082489146165, + "grad_norm": 1.3481419086456299, + "learning_rate": 4.376508447304908e-06, + "loss": 1.1012, + "step": 308 + }, + { + "epoch": 0.447178002894356, + "grad_norm": 1.4747331142425537, + "learning_rate": 4.372485921158488e-06, + "loss": 1.0298, + "step": 309 + }, + { + "epoch": 0.44862518089725034, + "grad_norm": 1.503267526626587, + "learning_rate": 4.368463395012068e-06, + "loss": 1.0244, + "step": 310 + }, + { + "epoch": 0.44862518089725034, + "eval_loss": 1.1231423616409302, + "eval_runtime": 23.7225, + "eval_samples_per_second": 42.154, + "eval_steps_per_second": 2.656, + "step": 310 + }, + { + "epoch": 0.45007235890014474, + "grad_norm": 1.35536789894104, + "learning_rate": 4.364440868865648e-06, + "loss": 1.0699, + "step": 311 + }, + { + "epoch": 0.4515195369030391, + "grad_norm": 1.411491870880127, + "learning_rate": 4.360418342719228e-06, + "loss": 1.0061, + "step": 312 + }, + { + "epoch": 0.45296671490593343, + "grad_norm": 1.3363436460494995, + "learning_rate": 4.356395816572808e-06, + "loss": 1.0467, + "step": 313 + }, + { + "epoch": 0.4544138929088278, + "grad_norm": 1.2964757680892944, + "learning_rate": 4.352373290426388e-06, + "loss": 1.0057, + "step": 314 + }, + { + "epoch": 0.4558610709117221, + "grad_norm": 1.3831514120101929, + "learning_rate": 4.348350764279968e-06, + "loss": 1.0622, + "step": 315 + }, + { + "epoch": 0.4573082489146165, + "grad_norm": 1.2829939126968384, + "learning_rate": 4.3443282381335485e-06, + "loss": 1.0031, + "step": 316 + }, + { + "epoch": 0.45875542691751087, + "grad_norm": 1.251723051071167, + "learning_rate": 4.340305711987129e-06, + "loss": 1.0979, + "step": 317 + }, + { + "epoch": 0.4602026049204052, + "grad_norm": 1.255389928817749, + "learning_rate": 4.336283185840709e-06, + "loss": 1.0117, + "step": 318 + }, + { + "epoch": 0.46164978292329956, + "grad_norm": 1.3971837759017944, + "learning_rate": 4.332260659694288e-06, + "loss": 1.0512, + "step": 319 + }, + { + "epoch": 0.4630969609261939, + "grad_norm": 1.5119656324386597, + "learning_rate": 4.328238133547868e-06, + "loss": 1.0497, + "step": 320 + }, + { + "epoch": 0.4630969609261939, + "eval_loss": 1.1185903549194336, + "eval_runtime": 23.7122, + "eval_samples_per_second": 42.172, + "eval_steps_per_second": 2.657, + "step": 320 + }, + { + "epoch": 0.4645441389290883, + "grad_norm": 1.3516604900360107, + "learning_rate": 4.324215607401448e-06, + "loss": 1.0458, + "step": 321 + }, + { + "epoch": 0.46599131693198265, + "grad_norm": 1.3941069841384888, + "learning_rate": 4.3201930812550284e-06, + "loss": 1.0259, + "step": 322 + }, + { + "epoch": 0.467438494934877, + "grad_norm": 1.4691979885101318, + "learning_rate": 4.3161705551086086e-06, + "loss": 1.0597, + "step": 323 + }, + { + "epoch": 0.46888567293777134, + "grad_norm": 1.3884938955307007, + "learning_rate": 4.312148028962189e-06, + "loss": 1.0431, + "step": 324 + }, + { + "epoch": 0.4703328509406657, + "grad_norm": 1.3524333238601685, + "learning_rate": 4.308125502815769e-06, + "loss": 1.0322, + "step": 325 + }, + { + "epoch": 0.4717800289435601, + "grad_norm": 1.6527141332626343, + "learning_rate": 4.304102976669349e-06, + "loss": 1.0319, + "step": 326 + }, + { + "epoch": 0.4732272069464544, + "grad_norm": 1.3735195398330688, + "learning_rate": 4.300080450522929e-06, + "loss": 1.019, + "step": 327 + }, + { + "epoch": 0.47467438494934877, + "grad_norm": 1.5720343589782715, + "learning_rate": 4.296057924376509e-06, + "loss": 1.0944, + "step": 328 + }, + { + "epoch": 0.4761215629522431, + "grad_norm": 1.3800513744354248, + "learning_rate": 4.2920353982300885e-06, + "loss": 1.0586, + "step": 329 + }, + { + "epoch": 0.47756874095513746, + "grad_norm": 1.2874724864959717, + "learning_rate": 4.288012872083669e-06, + "loss": 1.0475, + "step": 330 + }, + { + "epoch": 0.47756874095513746, + "eval_loss": 1.1196138858795166, + "eval_runtime": 23.6913, + "eval_samples_per_second": 42.21, + "eval_steps_per_second": 2.659, + "step": 330 + }, + { + "epoch": 0.47901591895803186, + "grad_norm": 1.5478601455688477, + "learning_rate": 4.283990345937249e-06, + "loss": 1.0005, + "step": 331 + }, + { + "epoch": 0.4804630969609262, + "grad_norm": 1.365601658821106, + "learning_rate": 4.279967819790829e-06, + "loss": 1.0464, + "step": 332 + }, + { + "epoch": 0.48191027496382055, + "grad_norm": 1.469828724861145, + "learning_rate": 4.275945293644409e-06, + "loss": 1.0473, + "step": 333 + }, + { + "epoch": 0.4833574529667149, + "grad_norm": 1.4974852800369263, + "learning_rate": 4.271922767497989e-06, + "loss": 1.0525, + "step": 334 + }, + { + "epoch": 0.48480463096960924, + "grad_norm": 1.353641390800476, + "learning_rate": 4.267900241351569e-06, + "loss": 1.023, + "step": 335 + }, + { + "epoch": 0.48625180897250364, + "grad_norm": 1.4370217323303223, + "learning_rate": 4.263877715205149e-06, + "loss": 1.033, + "step": 336 + }, + { + "epoch": 0.487698986975398, + "grad_norm": 1.4377245903015137, + "learning_rate": 4.2598551890587295e-06, + "loss": 1.0366, + "step": 337 + }, + { + "epoch": 0.48914616497829233, + "grad_norm": 1.3811371326446533, + "learning_rate": 4.25583266291231e-06, + "loss": 1.0404, + "step": 338 + }, + { + "epoch": 0.4905933429811867, + "grad_norm": 1.3572853803634644, + "learning_rate": 4.251810136765889e-06, + "loss": 1.028, + "step": 339 + }, + { + "epoch": 0.492040520984081, + "grad_norm": 1.339189887046814, + "learning_rate": 4.247787610619469e-06, + "loss": 1.0054, + "step": 340 + }, + { + "epoch": 0.492040520984081, + "eval_loss": 1.1196181774139404, + "eval_runtime": 23.7607, + "eval_samples_per_second": 42.086, + "eval_steps_per_second": 2.651, + "step": 340 + }, + { + "epoch": 0.4934876989869754, + "grad_norm": 1.3125951290130615, + "learning_rate": 4.243765084473049e-06, + "loss": 1.0476, + "step": 341 + }, + { + "epoch": 0.49493487698986977, + "grad_norm": 1.3548215627670288, + "learning_rate": 4.239742558326629e-06, + "loss": 1.051, + "step": 342 + }, + { + "epoch": 0.4963820549927641, + "grad_norm": 1.253381371498108, + "learning_rate": 4.235720032180209e-06, + "loss": 0.9838, + "step": 343 + }, + { + "epoch": 0.49782923299565845, + "grad_norm": 1.4013867378234863, + "learning_rate": 4.2316975060337895e-06, + "loss": 1.0353, + "step": 344 + }, + { + "epoch": 0.4992764109985528, + "grad_norm": 1.3587759733200073, + "learning_rate": 4.22767497988737e-06, + "loss": 1.0328, + "step": 345 + }, + { + "epoch": 0.5007235890014472, + "grad_norm": 1.2683507204055786, + "learning_rate": 4.22365245374095e-06, + "loss": 1.0222, + "step": 346 + }, + { + "epoch": 0.5021707670043415, + "grad_norm": 1.230000376701355, + "learning_rate": 4.21962992759453e-06, + "loss": 0.999, + "step": 347 + }, + { + "epoch": 0.5036179450072359, + "grad_norm": 1.3020412921905518, + "learning_rate": 4.21560740144811e-06, + "loss": 1.0347, + "step": 348 + }, + { + "epoch": 0.5050651230101303, + "grad_norm": 1.4678325653076172, + "learning_rate": 4.211584875301689e-06, + "loss": 1.0166, + "step": 349 + }, + { + "epoch": 0.5065123010130246, + "grad_norm": 1.5064916610717773, + "learning_rate": 4.2075623491552695e-06, + "loss": 1.01, + "step": 350 + }, + { + "epoch": 0.5065123010130246, + "eval_loss": 1.1162378787994385, + "eval_runtime": 23.7517, + "eval_samples_per_second": 42.102, + "eval_steps_per_second": 2.652, + "step": 350 + }, + { + "epoch": 0.507959479015919, + "grad_norm": 1.315529227256775, + "learning_rate": 4.20353982300885e-06, + "loss": 1.0424, + "step": 351 + }, + { + "epoch": 0.5094066570188133, + "grad_norm": 1.2978706359863281, + "learning_rate": 4.19951729686243e-06, + "loss": 1.0561, + "step": 352 + }, + { + "epoch": 0.5108538350217077, + "grad_norm": 1.3224369287490845, + "learning_rate": 4.19549477071601e-06, + "loss": 1.0216, + "step": 353 + }, + { + "epoch": 0.5123010130246021, + "grad_norm": 1.478445291519165, + "learning_rate": 4.19147224456959e-06, + "loss": 1.0588, + "step": 354 + }, + { + "epoch": 0.5137481910274964, + "grad_norm": 1.3233546018600464, + "learning_rate": 4.18744971842317e-06, + "loss": 0.9911, + "step": 355 + }, + { + "epoch": 0.5151953690303908, + "grad_norm": 1.5392651557922363, + "learning_rate": 4.18342719227675e-06, + "loss": 1.081, + "step": 356 + }, + { + "epoch": 0.516642547033285, + "grad_norm": 1.3318156003952026, + "learning_rate": 4.17940466613033e-06, + "loss": 1.0303, + "step": 357 + }, + { + "epoch": 0.5180897250361794, + "grad_norm": 1.2767019271850586, + "learning_rate": 4.1753821399839105e-06, + "loss": 1.0231, + "step": 358 + }, + { + "epoch": 0.5195369030390738, + "grad_norm": 1.3169499635696411, + "learning_rate": 4.17135961383749e-06, + "loss": 1.0535, + "step": 359 + }, + { + "epoch": 0.5209840810419681, + "grad_norm": 1.4249968528747559, + "learning_rate": 4.16733708769107e-06, + "loss": 1.0097, + "step": 360 + }, + { + "epoch": 0.5209840810419681, + "eval_loss": 1.1125266551971436, + "eval_runtime": 23.7106, + "eval_samples_per_second": 42.175, + "eval_steps_per_second": 2.657, + "step": 360 + }, + { + "epoch": 0.5224312590448625, + "grad_norm": 1.2392650842666626, + "learning_rate": 4.16331456154465e-06, + "loss": 1.053, + "step": 361 + }, + { + "epoch": 0.5238784370477568, + "grad_norm": 1.5001884698867798, + "learning_rate": 4.15929203539823e-06, + "loss": 1.018, + "step": 362 + }, + { + "epoch": 0.5253256150506512, + "grad_norm": 1.2545444965362549, + "learning_rate": 4.15526950925181e-06, + "loss": 1.0161, + "step": 363 + }, + { + "epoch": 0.5267727930535456, + "grad_norm": 1.239485502243042, + "learning_rate": 4.15124698310539e-06, + "loss": 1.0225, + "step": 364 + }, + { + "epoch": 0.5282199710564399, + "grad_norm": 1.4715404510498047, + "learning_rate": 4.1472244569589705e-06, + "loss": 1.0289, + "step": 365 + }, + { + "epoch": 0.5296671490593343, + "grad_norm": 1.4323079586029053, + "learning_rate": 4.143201930812551e-06, + "loss": 1.0157, + "step": 366 + }, + { + "epoch": 0.5311143270622286, + "grad_norm": 1.3055295944213867, + "learning_rate": 4.139179404666131e-06, + "loss": 0.9963, + "step": 367 + }, + { + "epoch": 0.532561505065123, + "grad_norm": 1.2465908527374268, + "learning_rate": 4.13515687851971e-06, + "loss": 1.0268, + "step": 368 + }, + { + "epoch": 0.5340086830680174, + "grad_norm": 1.3686151504516602, + "learning_rate": 4.13113435237329e-06, + "loss": 1.0203, + "step": 369 + }, + { + "epoch": 0.5354558610709117, + "grad_norm": 1.3973885774612427, + "learning_rate": 4.12711182622687e-06, + "loss": 1.0165, + "step": 370 + }, + { + "epoch": 0.5354558610709117, + "eval_loss": 1.111108660697937, + "eval_runtime": 23.8147, + "eval_samples_per_second": 41.991, + "eval_steps_per_second": 2.645, + "step": 370 + }, + { + "epoch": 0.5369030390738061, + "grad_norm": 1.2840689420700073, + "learning_rate": 4.1230893000804505e-06, + "loss": 1.0112, + "step": 371 + }, + { + "epoch": 0.5383502170767004, + "grad_norm": 1.3311495780944824, + "learning_rate": 4.119066773934031e-06, + "loss": 0.9826, + "step": 372 + }, + { + "epoch": 0.5397973950795948, + "grad_norm": 1.4287539720535278, + "learning_rate": 4.115044247787611e-06, + "loss": 1.0106, + "step": 373 + }, + { + "epoch": 0.5412445730824892, + "grad_norm": 1.5754257440567017, + "learning_rate": 4.111021721641191e-06, + "loss": 1.0271, + "step": 374 + }, + { + "epoch": 0.5426917510853835, + "grad_norm": 1.405771017074585, + "learning_rate": 4.106999195494771e-06, + "loss": 1.0256, + "step": 375 + }, + { + "epoch": 0.5441389290882779, + "grad_norm": 1.3505287170410156, + "learning_rate": 4.102976669348351e-06, + "loss": 0.9971, + "step": 376 + }, + { + "epoch": 0.5455861070911722, + "grad_norm": 1.2933286428451538, + "learning_rate": 4.098954143201931e-06, + "loss": 0.987, + "step": 377 + }, + { + "epoch": 0.5470332850940666, + "grad_norm": 1.2919787168502808, + "learning_rate": 4.094931617055511e-06, + "loss": 1.0097, + "step": 378 + }, + { + "epoch": 0.548480463096961, + "grad_norm": 1.4471627473831177, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.0347, + "step": 379 + }, + { + "epoch": 0.5499276410998553, + "grad_norm": 1.2930383682250977, + "learning_rate": 4.086886564762672e-06, + "loss": 1.0581, + "step": 380 + }, + { + "epoch": 0.5499276410998553, + "eval_loss": 1.104731798171997, + "eval_runtime": 23.6928, + "eval_samples_per_second": 42.207, + "eval_steps_per_second": 2.659, + "step": 380 + }, + { + "epoch": 0.5513748191027497, + "grad_norm": 1.353825330734253, + "learning_rate": 4.082864038616252e-06, + "loss": 0.9836, + "step": 381 + }, + { + "epoch": 0.552821997105644, + "grad_norm": 1.4356493949890137, + "learning_rate": 4.078841512469832e-06, + "loss": 1.0216, + "step": 382 + }, + { + "epoch": 0.5542691751085383, + "grad_norm": 1.3893400430679321, + "learning_rate": 4.074818986323412e-06, + "loss": 1.0226, + "step": 383 + }, + { + "epoch": 0.5557163531114327, + "grad_norm": 1.4296265840530396, + "learning_rate": 4.070796460176992e-06, + "loss": 1.0536, + "step": 384 + }, + { + "epoch": 0.557163531114327, + "grad_norm": 1.3603262901306152, + "learning_rate": 4.066773934030571e-06, + "loss": 1.0573, + "step": 385 + }, + { + "epoch": 0.5586107091172214, + "grad_norm": 1.3417487144470215, + "learning_rate": 4.0627514078841515e-06, + "loss": 1.0415, + "step": 386 + }, + { + "epoch": 0.5600578871201157, + "grad_norm": 1.419804334640503, + "learning_rate": 4.058728881737732e-06, + "loss": 0.9755, + "step": 387 + }, + { + "epoch": 0.5615050651230101, + "grad_norm": 1.3322898149490356, + "learning_rate": 4.054706355591312e-06, + "loss": 0.9733, + "step": 388 + }, + { + "epoch": 0.5629522431259045, + "grad_norm": 1.2970482110977173, + "learning_rate": 4.050683829444892e-06, + "loss": 1.0228, + "step": 389 + }, + { + "epoch": 0.5643994211287988, + "grad_norm": 1.412344217300415, + "learning_rate": 4.046661303298472e-06, + "loss": 1.0154, + "step": 390 + }, + { + "epoch": 0.5643994211287988, + "eval_loss": 1.1061700582504272, + "eval_runtime": 23.7292, + "eval_samples_per_second": 42.142, + "eval_steps_per_second": 2.655, + "step": 390 + }, + { + "epoch": 0.5658465991316932, + "grad_norm": 1.2040972709655762, + "learning_rate": 4.042638777152052e-06, + "loss": 1.011, + "step": 391 + }, + { + "epoch": 0.5672937771345875, + "grad_norm": 1.4383659362792969, + "learning_rate": 4.038616251005632e-06, + "loss": 1.0395, + "step": 392 + }, + { + "epoch": 0.5687409551374819, + "grad_norm": 1.30646812915802, + "learning_rate": 4.034593724859212e-06, + "loss": 1.0294, + "step": 393 + }, + { + "epoch": 0.5701881331403763, + "grad_norm": 1.2324343919754028, + "learning_rate": 4.0305711987127925e-06, + "loss": 1.0091, + "step": 394 + }, + { + "epoch": 0.5716353111432706, + "grad_norm": 1.3447880744934082, + "learning_rate": 4.026548672566372e-06, + "loss": 1.0257, + "step": 395 + }, + { + "epoch": 0.573082489146165, + "grad_norm": 1.4558801651000977, + "learning_rate": 4.022526146419952e-06, + "loss": 1.0362, + "step": 396 + }, + { + "epoch": 0.5745296671490593, + "grad_norm": 1.2694238424301147, + "learning_rate": 4.018503620273532e-06, + "loss": 1.0177, + "step": 397 + }, + { + "epoch": 0.5759768451519537, + "grad_norm": 1.4371013641357422, + "learning_rate": 4.014481094127112e-06, + "loss": 1.0371, + "step": 398 + }, + { + "epoch": 0.5774240231548481, + "grad_norm": 1.4840346574783325, + "learning_rate": 4.010458567980692e-06, + "loss": 1.022, + "step": 399 + }, + { + "epoch": 0.5788712011577424, + "grad_norm": 1.3151086568832397, + "learning_rate": 4.0064360418342725e-06, + "loss": 1.0224, + "step": 400 + }, + { + "epoch": 0.5788712011577424, + "eval_loss": 1.1010886430740356, + "eval_runtime": 23.6832, + "eval_samples_per_second": 42.224, + "eval_steps_per_second": 2.66, + "step": 400 + }, + { + "epoch": 0.5803183791606368, + "grad_norm": 1.3068777322769165, + "learning_rate": 4.002413515687853e-06, + "loss": 1.0169, + "step": 401 + }, + { + "epoch": 0.5817655571635311, + "grad_norm": 1.2886050939559937, + "learning_rate": 3.998390989541433e-06, + "loss": 1.0085, + "step": 402 + }, + { + "epoch": 0.5832127351664255, + "grad_norm": 1.2724084854125977, + "learning_rate": 3.994368463395013e-06, + "loss": 1.0057, + "step": 403 + }, + { + "epoch": 0.5846599131693199, + "grad_norm": 1.3457502126693726, + "learning_rate": 3.990345937248592e-06, + "loss": 1.0415, + "step": 404 + }, + { + "epoch": 0.5861070911722142, + "grad_norm": 1.393071174621582, + "learning_rate": 3.986323411102172e-06, + "loss": 1.0492, + "step": 405 + }, + { + "epoch": 0.5875542691751086, + "grad_norm": 1.2607946395874023, + "learning_rate": 3.982300884955752e-06, + "loss": 1.0476, + "step": 406 + }, + { + "epoch": 0.5890014471780028, + "grad_norm": 1.4300132989883423, + "learning_rate": 3.9782783588093325e-06, + "loss": 1.0275, + "step": 407 + }, + { + "epoch": 0.5904486251808972, + "grad_norm": 1.3288482427597046, + "learning_rate": 3.974255832662913e-06, + "loss": 0.9944, + "step": 408 + }, + { + "epoch": 0.5918958031837916, + "grad_norm": 1.247685194015503, + "learning_rate": 3.970233306516493e-06, + "loss": 0.9993, + "step": 409 + }, + { + "epoch": 0.5933429811866859, + "grad_norm": 1.397312045097351, + "learning_rate": 3.966210780370073e-06, + "loss": 0.9901, + "step": 410 + }, + { + "epoch": 0.5933429811866859, + "eval_loss": 1.0988625288009644, + "eval_runtime": 23.6837, + "eval_samples_per_second": 42.223, + "eval_steps_per_second": 2.66, + "step": 410 + }, + { + "epoch": 0.5947901591895803, + "grad_norm": 1.4144586324691772, + "learning_rate": 3.962188254223653e-06, + "loss": 1.0109, + "step": 411 + }, + { + "epoch": 0.5962373371924746, + "grad_norm": 1.3268319368362427, + "learning_rate": 3.958165728077233e-06, + "loss": 0.9977, + "step": 412 + }, + { + "epoch": 0.597684515195369, + "grad_norm": 1.4464187622070312, + "learning_rate": 3.954143201930813e-06, + "loss": 0.976, + "step": 413 + }, + { + "epoch": 0.5991316931982634, + "grad_norm": 1.433099627494812, + "learning_rate": 3.9501206757843926e-06, + "loss": 1.0358, + "step": 414 + }, + { + "epoch": 0.6005788712011577, + "grad_norm": 1.3310801982879639, + "learning_rate": 3.946098149637973e-06, + "loss": 1.0366, + "step": 415 + }, + { + "epoch": 0.6020260492040521, + "grad_norm": 1.3570735454559326, + "learning_rate": 3.942075623491553e-06, + "loss": 0.9888, + "step": 416 + }, + { + "epoch": 0.6034732272069464, + "grad_norm": 1.3415942192077637, + "learning_rate": 3.938053097345133e-06, + "loss": 0.9807, + "step": 417 + }, + { + "epoch": 0.6049204052098408, + "grad_norm": 1.2363113164901733, + "learning_rate": 3.934030571198713e-06, + "loss": 1.0436, + "step": 418 + }, + { + "epoch": 0.6063675832127352, + "grad_norm": 1.22818922996521, + "learning_rate": 3.930008045052293e-06, + "loss": 1.0109, + "step": 419 + }, + { + "epoch": 0.6078147612156295, + "grad_norm": 1.2622146606445312, + "learning_rate": 3.925985518905873e-06, + "loss": 0.9794, + "step": 420 + }, + { + "epoch": 0.6078147612156295, + "eval_loss": 1.0965070724487305, + "eval_runtime": 23.7473, + "eval_samples_per_second": 42.11, + "eval_steps_per_second": 2.653, + "step": 420 + }, + { + "epoch": 0.6092619392185239, + "grad_norm": 1.3269673585891724, + "learning_rate": 3.9219629927594534e-06, + "loss": 0.9854, + "step": 421 + }, + { + "epoch": 0.6107091172214182, + "grad_norm": 1.4200475215911865, + "learning_rate": 3.9179404666130336e-06, + "loss": 1.0236, + "step": 422 + }, + { + "epoch": 0.6121562952243126, + "grad_norm": 1.3433822393417358, + "learning_rate": 3.913917940466614e-06, + "loss": 0.9918, + "step": 423 + }, + { + "epoch": 0.613603473227207, + "grad_norm": 1.3996856212615967, + "learning_rate": 3.909895414320193e-06, + "loss": 1.0573, + "step": 424 + }, + { + "epoch": 0.6150506512301013, + "grad_norm": 1.3051807880401611, + "learning_rate": 3.905872888173773e-06, + "loss": 1.0188, + "step": 425 + }, + { + "epoch": 0.6164978292329957, + "grad_norm": 1.4123880863189697, + "learning_rate": 3.901850362027353e-06, + "loss": 0.9988, + "step": 426 + }, + { + "epoch": 0.61794500723589, + "grad_norm": 1.2546141147613525, + "learning_rate": 3.897827835880933e-06, + "loss": 0.9938, + "step": 427 + }, + { + "epoch": 0.6193921852387844, + "grad_norm": 1.2290722131729126, + "learning_rate": 3.8938053097345135e-06, + "loss": 0.9789, + "step": 428 + }, + { + "epoch": 0.6208393632416788, + "grad_norm": 1.3346364498138428, + "learning_rate": 3.889782783588094e-06, + "loss": 0.9953, + "step": 429 + }, + { + "epoch": 0.622286541244573, + "grad_norm": 1.4257797002792358, + "learning_rate": 3.885760257441674e-06, + "loss": 1.0003, + "step": 430 + }, + { + "epoch": 0.622286541244573, + "eval_loss": 1.0964648723602295, + "eval_runtime": 23.7372, + "eval_samples_per_second": 42.128, + "eval_steps_per_second": 2.654, + "step": 430 + }, + { + "epoch": 0.6237337192474675, + "grad_norm": 1.4270843267440796, + "learning_rate": 3.881737731295254e-06, + "loss": 0.9934, + "step": 431 + }, + { + "epoch": 0.6251808972503617, + "grad_norm": 1.2762629985809326, + "learning_rate": 3.877715205148834e-06, + "loss": 1.0111, + "step": 432 + }, + { + "epoch": 0.6266280752532561, + "grad_norm": 1.3001309633255005, + "learning_rate": 3.873692679002414e-06, + "loss": 1.0154, + "step": 433 + }, + { + "epoch": 0.6280752532561505, + "grad_norm": 1.2802761793136597, + "learning_rate": 3.869670152855993e-06, + "loss": 1.0424, + "step": 434 + }, + { + "epoch": 0.6295224312590448, + "grad_norm": 1.246375322341919, + "learning_rate": 3.8656476267095735e-06, + "loss": 0.9759, + "step": 435 + }, + { + "epoch": 0.6309696092619392, + "grad_norm": 1.2724950313568115, + "learning_rate": 3.861625100563154e-06, + "loss": 0.9865, + "step": 436 + }, + { + "epoch": 0.6324167872648335, + "grad_norm": 1.3080137968063354, + "learning_rate": 3.857602574416734e-06, + "loss": 1.0344, + "step": 437 + }, + { + "epoch": 0.6338639652677279, + "grad_norm": 1.4367538690567017, + "learning_rate": 3.853580048270314e-06, + "loss": 1.0536, + "step": 438 + }, + { + "epoch": 0.6353111432706223, + "grad_norm": 1.3486604690551758, + "learning_rate": 3.849557522123894e-06, + "loss": 1.0176, + "step": 439 + }, + { + "epoch": 0.6367583212735166, + "grad_norm": 1.5524886846542358, + "learning_rate": 3.845534995977474e-06, + "loss": 0.9979, + "step": 440 + }, + { + "epoch": 0.6367583212735166, + "eval_loss": 1.0928927659988403, + "eval_runtime": 23.7331, + "eval_samples_per_second": 42.135, + "eval_steps_per_second": 2.655, + "step": 440 + }, + { + "epoch": 0.638205499276411, + "grad_norm": 1.336746335029602, + "learning_rate": 3.841512469831054e-06, + "loss": 0.977, + "step": 441 + }, + { + "epoch": 0.6396526772793053, + "grad_norm": 1.4037328958511353, + "learning_rate": 3.8374899436846344e-06, + "loss": 1.0166, + "step": 442 + }, + { + "epoch": 0.6410998552821997, + "grad_norm": 1.50180983543396, + "learning_rate": 3.8334674175382146e-06, + "loss": 1.0211, + "step": 443 + }, + { + "epoch": 0.6425470332850941, + "grad_norm": 1.3024051189422607, + "learning_rate": 3.829444891391794e-06, + "loss": 0.9843, + "step": 444 + }, + { + "epoch": 0.6439942112879884, + "grad_norm": 1.319004774093628, + "learning_rate": 3.825422365245374e-06, + "loss": 0.9853, + "step": 445 + }, + { + "epoch": 0.6454413892908828, + "grad_norm": 1.4267810583114624, + "learning_rate": 3.821399839098954e-06, + "loss": 0.9808, + "step": 446 + }, + { + "epoch": 0.6468885672937771, + "grad_norm": 1.24380624294281, + "learning_rate": 3.817377312952534e-06, + "loss": 1.0312, + "step": 447 + }, + { + "epoch": 0.6483357452966715, + "grad_norm": 1.264627456665039, + "learning_rate": 3.8133547868061144e-06, + "loss": 0.9928, + "step": 448 + }, + { + "epoch": 0.6497829232995659, + "grad_norm": 1.291927695274353, + "learning_rate": 3.8093322606596945e-06, + "loss": 1.0435, + "step": 449 + }, + { + "epoch": 0.6512301013024602, + "grad_norm": 1.419937014579773, + "learning_rate": 3.8053097345132746e-06, + "loss": 1.0246, + "step": 450 + }, + { + "epoch": 0.6512301013024602, + "eval_loss": 1.0930119752883911, + "eval_runtime": 23.7155, + "eval_samples_per_second": 42.166, + "eval_steps_per_second": 2.656, + "step": 450 + }, + { + "epoch": 0.6526772793053546, + "grad_norm": 1.4046635627746582, + "learning_rate": 3.8012872083668543e-06, + "loss": 0.9931, + "step": 451 + }, + { + "epoch": 0.6541244573082489, + "grad_norm": 1.3524024486541748, + "learning_rate": 3.7972646822204344e-06, + "loss": 0.9965, + "step": 452 + }, + { + "epoch": 0.6555716353111433, + "grad_norm": 1.2823618650436401, + "learning_rate": 3.7932421560740146e-06, + "loss": 0.9965, + "step": 453 + }, + { + "epoch": 0.6570188133140377, + "grad_norm": 1.2906886339187622, + "learning_rate": 3.7892196299275947e-06, + "loss": 0.9827, + "step": 454 + }, + { + "epoch": 0.658465991316932, + "grad_norm": 1.2599241733551025, + "learning_rate": 3.785197103781175e-06, + "loss": 0.9599, + "step": 455 + }, + { + "epoch": 0.6599131693198264, + "grad_norm": 1.265752911567688, + "learning_rate": 3.7811745776347545e-06, + "loss": 1.023, + "step": 456 + }, + { + "epoch": 0.6613603473227206, + "grad_norm": 1.3204699754714966, + "learning_rate": 3.7771520514883347e-06, + "loss": 1.0189, + "step": 457 + }, + { + "epoch": 0.662807525325615, + "grad_norm": 1.4138580560684204, + "learning_rate": 3.7731295253419148e-06, + "loss": 1.0059, + "step": 458 + }, + { + "epoch": 0.6642547033285094, + "grad_norm": 1.337053656578064, + "learning_rate": 3.769106999195495e-06, + "loss": 0.9889, + "step": 459 + }, + { + "epoch": 0.6657018813314037, + "grad_norm": 1.3081594705581665, + "learning_rate": 3.765084473049075e-06, + "loss": 1.0065, + "step": 460 + }, + { + "epoch": 0.6657018813314037, + "eval_loss": 1.0952460765838623, + "eval_runtime": 23.6688, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 2.662, + "step": 460 + }, + { + "epoch": 0.6671490593342981, + "grad_norm": 1.3889070749282837, + "learning_rate": 3.7610619469026547e-06, + "loss": 1.0256, + "step": 461 + }, + { + "epoch": 0.6685962373371924, + "grad_norm": 1.3292567729949951, + "learning_rate": 3.757039420756235e-06, + "loss": 1.0369, + "step": 462 + }, + { + "epoch": 0.6700434153400868, + "grad_norm": 1.2322574853897095, + "learning_rate": 3.753016894609815e-06, + "loss": 0.9957, + "step": 463 + }, + { + "epoch": 0.6714905933429812, + "grad_norm": 1.2834523916244507, + "learning_rate": 3.7489943684633956e-06, + "loss": 1.0276, + "step": 464 + }, + { + "epoch": 0.6729377713458755, + "grad_norm": 1.2598627805709839, + "learning_rate": 3.7449718423169757e-06, + "loss": 1.0332, + "step": 465 + }, + { + "epoch": 0.6743849493487699, + "grad_norm": 1.392765760421753, + "learning_rate": 3.740949316170556e-06, + "loss": 0.9829, + "step": 466 + }, + { + "epoch": 0.6758321273516642, + "grad_norm": 1.2173590660095215, + "learning_rate": 3.7369267900241355e-06, + "loss": 0.9855, + "step": 467 + }, + { + "epoch": 0.6772793053545586, + "grad_norm": 1.3330848217010498, + "learning_rate": 3.7329042638777156e-06, + "loss": 0.9955, + "step": 468 + }, + { + "epoch": 0.678726483357453, + "grad_norm": 1.3685492277145386, + "learning_rate": 3.7288817377312958e-06, + "loss": 0.9965, + "step": 469 + }, + { + "epoch": 0.6801736613603473, + "grad_norm": 1.390434980392456, + "learning_rate": 3.724859211584876e-06, + "loss": 1.0202, + "step": 470 + }, + { + "epoch": 0.6801736613603473, + "eval_loss": 1.0914441347122192, + "eval_runtime": 23.7358, + "eval_samples_per_second": 42.13, + "eval_steps_per_second": 2.654, + "step": 470 + }, + { + "epoch": 0.6816208393632417, + "grad_norm": 1.3303569555282593, + "learning_rate": 3.720836685438456e-06, + "loss": 1.0398, + "step": 471 + }, + { + "epoch": 0.683068017366136, + "grad_norm": 1.3635236024856567, + "learning_rate": 3.7168141592920357e-06, + "loss": 1.0061, + "step": 472 + }, + { + "epoch": 0.6845151953690304, + "grad_norm": 1.3613839149475098, + "learning_rate": 3.712791633145616e-06, + "loss": 0.9921, + "step": 473 + }, + { + "epoch": 0.6859623733719248, + "grad_norm": 1.3894354104995728, + "learning_rate": 3.708769106999196e-06, + "loss": 1.0231, + "step": 474 + }, + { + "epoch": 0.6874095513748191, + "grad_norm": 1.3147318363189697, + "learning_rate": 3.704746580852776e-06, + "loss": 1.012, + "step": 475 + }, + { + "epoch": 0.6888567293777135, + "grad_norm": 1.3795945644378662, + "learning_rate": 3.7007240547063562e-06, + "loss": 0.9855, + "step": 476 + }, + { + "epoch": 0.6903039073806078, + "grad_norm": 1.3321094512939453, + "learning_rate": 3.696701528559936e-06, + "loss": 1.0003, + "step": 477 + }, + { + "epoch": 0.6917510853835022, + "grad_norm": 1.3682198524475098, + "learning_rate": 3.692679002413516e-06, + "loss": 0.9732, + "step": 478 + }, + { + "epoch": 0.6931982633863966, + "grad_norm": 1.3183374404907227, + "learning_rate": 3.688656476267096e-06, + "loss": 0.9909, + "step": 479 + }, + { + "epoch": 0.6946454413892909, + "grad_norm": 1.4067922830581665, + "learning_rate": 3.6846339501206763e-06, + "loss": 1.0006, + "step": 480 + }, + { + "epoch": 0.6946454413892909, + "eval_loss": 1.0895153284072876, + "eval_runtime": 23.7535, + "eval_samples_per_second": 42.099, + "eval_steps_per_second": 2.652, + "step": 480 + }, + { + "epoch": 0.6960926193921853, + "grad_norm": 1.4125021696090698, + "learning_rate": 3.6806114239742565e-06, + "loss": 0.9415, + "step": 481 + }, + { + "epoch": 0.6975397973950795, + "grad_norm": 1.3365672826766968, + "learning_rate": 3.676588897827836e-06, + "loss": 0.989, + "step": 482 + }, + { + "epoch": 0.6989869753979739, + "grad_norm": 1.2873859405517578, + "learning_rate": 3.6725663716814163e-06, + "loss": 0.9809, + "step": 483 + }, + { + "epoch": 0.7004341534008683, + "grad_norm": 1.3345638513565063, + "learning_rate": 3.6685438455349964e-06, + "loss": 1.0138, + "step": 484 + }, + { + "epoch": 0.7018813314037626, + "grad_norm": 1.3752779960632324, + "learning_rate": 3.6645213193885765e-06, + "loss": 0.9907, + "step": 485 + }, + { + "epoch": 0.703328509406657, + "grad_norm": 1.3365281820297241, + "learning_rate": 3.6604987932421567e-06, + "loss": 1.0223, + "step": 486 + }, + { + "epoch": 0.7047756874095513, + "grad_norm": 1.4185869693756104, + "learning_rate": 3.6564762670957364e-06, + "loss": 0.9938, + "step": 487 + }, + { + "epoch": 0.7062228654124457, + "grad_norm": 1.3365386724472046, + "learning_rate": 3.6524537409493165e-06, + "loss": 0.9902, + "step": 488 + }, + { + "epoch": 0.7076700434153401, + "grad_norm": 1.3268109560012817, + "learning_rate": 3.6484312148028966e-06, + "loss": 0.9728, + "step": 489 + }, + { + "epoch": 0.7091172214182344, + "grad_norm": 1.3617050647735596, + "learning_rate": 3.6444086886564768e-06, + "loss": 1.0022, + "step": 490 + }, + { + "epoch": 0.7091172214182344, + "eval_loss": 1.0881752967834473, + "eval_runtime": 23.7409, + "eval_samples_per_second": 42.121, + "eval_steps_per_second": 2.654, + "step": 490 + }, + { + "epoch": 0.7105643994211288, + "grad_norm": 1.378687858581543, + "learning_rate": 3.640386162510057e-06, + "loss": 1.0182, + "step": 491 + }, + { + "epoch": 0.7120115774240231, + "grad_norm": 1.38932204246521, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.0408, + "step": 492 + }, + { + "epoch": 0.7134587554269175, + "grad_norm": 1.357516884803772, + "learning_rate": 3.6323411102172167e-06, + "loss": 1.0177, + "step": 493 + }, + { + "epoch": 0.7149059334298119, + "grad_norm": 1.3478199243545532, + "learning_rate": 3.628318584070797e-06, + "loss": 0.9917, + "step": 494 + }, + { + "epoch": 0.7163531114327062, + "grad_norm": 1.3160595893859863, + "learning_rate": 3.624296057924377e-06, + "loss": 0.9848, + "step": 495 + }, + { + "epoch": 0.7178002894356006, + "grad_norm": 1.2829647064208984, + "learning_rate": 3.620273531777957e-06, + "loss": 0.9643, + "step": 496 + }, + { + "epoch": 0.7192474674384949, + "grad_norm": 1.2676979303359985, + "learning_rate": 3.616251005631537e-06, + "loss": 0.9646, + "step": 497 + }, + { + "epoch": 0.7206946454413893, + "grad_norm": 1.4029276371002197, + "learning_rate": 3.612228479485117e-06, + "loss": 0.972, + "step": 498 + }, + { + "epoch": 0.7221418234442837, + "grad_norm": 1.4197978973388672, + "learning_rate": 3.608205953338697e-06, + "loss": 1.0122, + "step": 499 + }, + { + "epoch": 0.723589001447178, + "grad_norm": 1.3601288795471191, + "learning_rate": 3.604183427192277e-06, + "loss": 0.99, + "step": 500 + }, + { + "epoch": 0.723589001447178, + "eval_loss": 1.0874083042144775, + "eval_runtime": 23.7551, + "eval_samples_per_second": 42.096, + "eval_steps_per_second": 2.652, + "step": 500 + }, + { + "epoch": 0.7250361794500724, + "grad_norm": 1.3303720951080322, + "learning_rate": 3.6001609010458573e-06, + "loss": 0.9654, + "step": 501 + }, + { + "epoch": 0.7264833574529667, + "grad_norm": 1.5043152570724487, + "learning_rate": 3.596138374899437e-06, + "loss": 1.0168, + "step": 502 + }, + { + "epoch": 0.7279305354558611, + "grad_norm": 1.2835030555725098, + "learning_rate": 3.592115848753017e-06, + "loss": 0.9918, + "step": 503 + }, + { + "epoch": 0.7293777134587555, + "grad_norm": 1.3037515878677368, + "learning_rate": 3.5880933226065973e-06, + "loss": 1.0031, + "step": 504 + }, + { + "epoch": 0.7308248914616498, + "grad_norm": 1.3722089529037476, + "learning_rate": 3.5840707964601774e-06, + "loss": 0.9937, + "step": 505 + }, + { + "epoch": 0.7322720694645442, + "grad_norm": 1.3899195194244385, + "learning_rate": 3.5800482703137575e-06, + "loss": 0.97, + "step": 506 + }, + { + "epoch": 0.7337192474674384, + "grad_norm": 1.3968757390975952, + "learning_rate": 3.5760257441673372e-06, + "loss": 0.9781, + "step": 507 + }, + { + "epoch": 0.7351664254703328, + "grad_norm": 1.508473515510559, + "learning_rate": 3.5720032180209174e-06, + "loss": 0.9825, + "step": 508 + }, + { + "epoch": 0.7366136034732272, + "grad_norm": 1.466068983078003, + "learning_rate": 3.5679806918744975e-06, + "loss": 0.9464, + "step": 509 + }, + { + "epoch": 0.7380607814761215, + "grad_norm": 1.5577962398529053, + "learning_rate": 3.5639581657280776e-06, + "loss": 1.0123, + "step": 510 + }, + { + "epoch": 0.7380607814761215, + "eval_loss": 1.0857983827590942, + "eval_runtime": 23.6758, + "eval_samples_per_second": 42.237, + "eval_steps_per_second": 2.661, + "step": 510 + }, + { + "epoch": 0.7395079594790159, + "grad_norm": 1.3283613920211792, + "learning_rate": 3.5599356395816577e-06, + "loss": 0.9975, + "step": 511 + }, + { + "epoch": 0.7409551374819102, + "grad_norm": 1.25275456905365, + "learning_rate": 3.5559131134352374e-06, + "loss": 0.9855, + "step": 512 + }, + { + "epoch": 0.7424023154848046, + "grad_norm": 1.4888328313827515, + "learning_rate": 3.5518905872888176e-06, + "loss": 1.0167, + "step": 513 + }, + { + "epoch": 0.743849493487699, + "grad_norm": 1.4865354299545288, + "learning_rate": 3.5478680611423977e-06, + "loss": 0.9861, + "step": 514 + }, + { + "epoch": 0.7452966714905933, + "grad_norm": 1.257825255393982, + "learning_rate": 3.543845534995978e-06, + "loss": 0.924, + "step": 515 + }, + { + "epoch": 0.7467438494934877, + "grad_norm": 1.4692885875701904, + "learning_rate": 3.539823008849558e-06, + "loss": 0.9804, + "step": 516 + }, + { + "epoch": 0.748191027496382, + "grad_norm": 1.3799200057983398, + "learning_rate": 3.5358004827031377e-06, + "loss": 1.0196, + "step": 517 + }, + { + "epoch": 0.7496382054992764, + "grad_norm": 1.4064457416534424, + "learning_rate": 3.5317779565567178e-06, + "loss": 1.0246, + "step": 518 + }, + { + "epoch": 0.7510853835021708, + "grad_norm": 1.324123740196228, + "learning_rate": 3.527755430410298e-06, + "loss": 0.958, + "step": 519 + }, + { + "epoch": 0.7525325615050651, + "grad_norm": 1.2538808584213257, + "learning_rate": 3.523732904263878e-06, + "loss": 0.9758, + "step": 520 + }, + { + "epoch": 0.7525325615050651, + "eval_loss": 1.085405707359314, + "eval_runtime": 23.6789, + "eval_samples_per_second": 42.232, + "eval_steps_per_second": 2.661, + "step": 520 + }, + { + "epoch": 0.7539797395079595, + "grad_norm": 1.355237603187561, + "learning_rate": 3.519710378117458e-06, + "loss": 1.0104, + "step": 521 + }, + { + "epoch": 0.7554269175108539, + "grad_norm": 1.4878712892532349, + "learning_rate": 3.515687851971038e-06, + "loss": 0.9645, + "step": 522 + }, + { + "epoch": 0.7568740955137482, + "grad_norm": 1.3971890211105347, + "learning_rate": 3.511665325824618e-06, + "loss": 1.0162, + "step": 523 + }, + { + "epoch": 0.7583212735166426, + "grad_norm": 1.400408387184143, + "learning_rate": 3.507642799678198e-06, + "loss": 0.9893, + "step": 524 + }, + { + "epoch": 0.7597684515195369, + "grad_norm": 1.4196466207504272, + "learning_rate": 3.5036202735317783e-06, + "loss": 0.9837, + "step": 525 + }, + { + "epoch": 0.7612156295224313, + "grad_norm": 1.326009750366211, + "learning_rate": 3.4995977473853584e-06, + "loss": 0.9977, + "step": 526 + }, + { + "epoch": 0.7626628075253257, + "grad_norm": 1.3473025560379028, + "learning_rate": 3.495575221238938e-06, + "loss": 1.005, + "step": 527 + }, + { + "epoch": 0.76410998552822, + "grad_norm": 1.392988681793213, + "learning_rate": 3.4915526950925182e-06, + "loss": 0.9923, + "step": 528 + }, + { + "epoch": 0.7655571635311144, + "grad_norm": 1.3056765794754028, + "learning_rate": 3.4875301689460983e-06, + "loss": 0.9866, + "step": 529 + }, + { + "epoch": 0.7670043415340086, + "grad_norm": 1.4353290796279907, + "learning_rate": 3.4835076427996785e-06, + "loss": 1.0028, + "step": 530 + }, + { + "epoch": 0.7670043415340086, + "eval_loss": 1.0846103429794312, + "eval_runtime": 23.6684, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 2.662, + "step": 530 + }, + { + "epoch": 0.768451519536903, + "grad_norm": 1.207763671875, + "learning_rate": 3.4794851166532586e-06, + "loss": 0.9784, + "step": 531 + }, + { + "epoch": 0.7698986975397974, + "grad_norm": 1.2452714443206787, + "learning_rate": 3.4754625905068383e-06, + "loss": 0.9725, + "step": 532 + }, + { + "epoch": 0.7713458755426917, + "grad_norm": 1.3213024139404297, + "learning_rate": 3.4714400643604184e-06, + "loss": 0.9965, + "step": 533 + }, + { + "epoch": 0.7727930535455861, + "grad_norm": 1.3085819482803345, + "learning_rate": 3.4674175382139986e-06, + "loss": 0.9834, + "step": 534 + }, + { + "epoch": 0.7742402315484804, + "grad_norm": 1.3090100288391113, + "learning_rate": 3.4633950120675787e-06, + "loss": 1.0003, + "step": 535 + }, + { + "epoch": 0.7756874095513748, + "grad_norm": 1.3375816345214844, + "learning_rate": 3.4593724859211584e-06, + "loss": 0.9997, + "step": 536 + }, + { + "epoch": 0.7771345875542692, + "grad_norm": 1.3109349012374878, + "learning_rate": 3.4553499597747385e-06, + "loss": 0.9844, + "step": 537 + }, + { + "epoch": 0.7785817655571635, + "grad_norm": 1.408186674118042, + "learning_rate": 3.4513274336283186e-06, + "loss": 0.9988, + "step": 538 + }, + { + "epoch": 0.7800289435600579, + "grad_norm": 1.2801100015640259, + "learning_rate": 3.4473049074818988e-06, + "loss": 0.9787, + "step": 539 + }, + { + "epoch": 0.7814761215629522, + "grad_norm": 1.4009971618652344, + "learning_rate": 3.443282381335479e-06, + "loss": 1.0018, + "step": 540 + }, + { + "epoch": 0.7814761215629522, + "eval_loss": 1.083585262298584, + "eval_runtime": 23.7133, + "eval_samples_per_second": 42.171, + "eval_steps_per_second": 2.657, + "step": 540 + }, + { + "epoch": 0.7829232995658466, + "grad_norm": 1.3215641975402832, + "learning_rate": 3.4392598551890586e-06, + "loss": 1.014, + "step": 541 + }, + { + "epoch": 0.784370477568741, + "grad_norm": 1.3028985261917114, + "learning_rate": 3.4352373290426387e-06, + "loss": 0.9895, + "step": 542 + }, + { + "epoch": 0.7858176555716353, + "grad_norm": 1.5162384510040283, + "learning_rate": 3.431214802896219e-06, + "loss": 0.9776, + "step": 543 + }, + { + "epoch": 0.7872648335745297, + "grad_norm": 1.3252856731414795, + "learning_rate": 3.427192276749799e-06, + "loss": 1.0036, + "step": 544 + }, + { + "epoch": 0.788712011577424, + "grad_norm": 1.2956281900405884, + "learning_rate": 3.423169750603379e-06, + "loss": 0.9587, + "step": 545 + }, + { + "epoch": 0.7901591895803184, + "grad_norm": 1.4463497400283813, + "learning_rate": 3.419147224456959e-06, + "loss": 1.0052, + "step": 546 + }, + { + "epoch": 0.7916063675832128, + "grad_norm": 1.4329155683517456, + "learning_rate": 3.415124698310539e-06, + "loss": 0.9858, + "step": 547 + }, + { + "epoch": 0.7930535455861071, + "grad_norm": 1.3873951435089111, + "learning_rate": 3.411102172164119e-06, + "loss": 0.9847, + "step": 548 + }, + { + "epoch": 0.7945007235890015, + "grad_norm": 1.4683536291122437, + "learning_rate": 3.407079646017699e-06, + "loss": 1.0052, + "step": 549 + }, + { + "epoch": 0.7959479015918958, + "grad_norm": 1.3398445844650269, + "learning_rate": 3.4030571198712793e-06, + "loss": 0.9924, + "step": 550 + }, + { + "epoch": 0.7959479015918958, + "eval_loss": 1.0833395719528198, + "eval_runtime": 23.7487, + "eval_samples_per_second": 42.108, + "eval_steps_per_second": 2.653, + "step": 550 + }, + { + "epoch": 0.7973950795947902, + "grad_norm": 1.5097326040267944, + "learning_rate": 3.399034593724859e-06, + "loss": 0.9674, + "step": 551 + }, + { + "epoch": 0.7988422575976846, + "grad_norm": 1.5294888019561768, + "learning_rate": 3.395012067578439e-06, + "loss": 1.0153, + "step": 552 + }, + { + "epoch": 0.8002894356005789, + "grad_norm": 1.3826487064361572, + "learning_rate": 3.3909895414320197e-06, + "loss": 0.9757, + "step": 553 + }, + { + "epoch": 0.8017366136034733, + "grad_norm": 1.4059821367263794, + "learning_rate": 3.3869670152856e-06, + "loss": 0.9682, + "step": 554 + }, + { + "epoch": 0.8031837916063675, + "grad_norm": 1.2964725494384766, + "learning_rate": 3.38294448913918e-06, + "loss": 0.957, + "step": 555 + }, + { + "epoch": 0.804630969609262, + "grad_norm": 1.338996171951294, + "learning_rate": 3.37892196299276e-06, + "loss": 0.9678, + "step": 556 + }, + { + "epoch": 0.8060781476121563, + "grad_norm": 1.3852205276489258, + "learning_rate": 3.3748994368463402e-06, + "loss": 0.9711, + "step": 557 + }, + { + "epoch": 0.8075253256150506, + "grad_norm": 1.4509114027023315, + "learning_rate": 3.37087691069992e-06, + "loss": 0.9764, + "step": 558 + }, + { + "epoch": 0.808972503617945, + "grad_norm": 1.384798288345337, + "learning_rate": 3.3668543845535e-06, + "loss": 1.0201, + "step": 559 + }, + { + "epoch": 0.8104196816208393, + "grad_norm": 1.2645292282104492, + "learning_rate": 3.36283185840708e-06, + "loss": 0.9975, + "step": 560 + }, + { + "epoch": 0.8104196816208393, + "eval_loss": 1.0793037414550781, + "eval_runtime": 23.7168, + "eval_samples_per_second": 42.164, + "eval_steps_per_second": 2.656, + "step": 560 + }, + { + "epoch": 0.8118668596237337, + "grad_norm": 1.310092806816101, + "learning_rate": 3.3588093322606603e-06, + "loss": 1.0135, + "step": 561 + }, + { + "epoch": 0.8133140376266281, + "grad_norm": 1.388508915901184, + "learning_rate": 3.3547868061142404e-06, + "loss": 0.9677, + "step": 562 + }, + { + "epoch": 0.8147612156295224, + "grad_norm": 1.2599003314971924, + "learning_rate": 3.35076427996782e-06, + "loss": 0.9778, + "step": 563 + }, + { + "epoch": 0.8162083936324168, + "grad_norm": 1.2952570915222168, + "learning_rate": 3.3467417538214003e-06, + "loss": 0.9801, + "step": 564 + }, + { + "epoch": 0.8176555716353111, + "grad_norm": 1.419215440750122, + "learning_rate": 3.3427192276749804e-06, + "loss": 0.9819, + "step": 565 + }, + { + "epoch": 0.8191027496382055, + "grad_norm": 1.456234097480774, + "learning_rate": 3.3386967015285605e-06, + "loss": 0.9717, + "step": 566 + }, + { + "epoch": 0.8205499276410999, + "grad_norm": 1.352244257926941, + "learning_rate": 3.3346741753821407e-06, + "loss": 0.9775, + "step": 567 + }, + { + "epoch": 0.8219971056439942, + "grad_norm": 1.458857536315918, + "learning_rate": 3.3306516492357204e-06, + "loss": 0.983, + "step": 568 + }, + { + "epoch": 0.8234442836468886, + "grad_norm": 1.3408229351043701, + "learning_rate": 3.3266291230893005e-06, + "loss": 1.0009, + "step": 569 + }, + { + "epoch": 0.8248914616497829, + "grad_norm": 1.2735706567764282, + "learning_rate": 3.3226065969428806e-06, + "loss": 1.0, + "step": 570 + }, + { + "epoch": 0.8248914616497829, + "eval_loss": 1.078357458114624, + "eval_runtime": 23.7336, + "eval_samples_per_second": 42.134, + "eval_steps_per_second": 2.654, + "step": 570 + }, + { + "epoch": 0.8263386396526773, + "grad_norm": 1.352004051208496, + "learning_rate": 3.3185840707964607e-06, + "loss": 0.9951, + "step": 571 + }, + { + "epoch": 0.8277858176555717, + "grad_norm": 1.3458831310272217, + "learning_rate": 3.3145615446500404e-06, + "loss": 1.0376, + "step": 572 + }, + { + "epoch": 0.829232995658466, + "grad_norm": 1.2867108583450317, + "learning_rate": 3.3105390185036206e-06, + "loss": 0.97, + "step": 573 + }, + { + "epoch": 0.8306801736613604, + "grad_norm": 1.3911094665527344, + "learning_rate": 3.3065164923572007e-06, + "loss": 0.9871, + "step": 574 + }, + { + "epoch": 0.8321273516642547, + "grad_norm": 1.3022605180740356, + "learning_rate": 3.302493966210781e-06, + "loss": 0.9624, + "step": 575 + }, + { + "epoch": 0.8335745296671491, + "grad_norm": 1.357945442199707, + "learning_rate": 3.298471440064361e-06, + "loss": 0.9786, + "step": 576 + }, + { + "epoch": 0.8350217076700435, + "grad_norm": 1.2639540433883667, + "learning_rate": 3.2944489139179407e-06, + "loss": 0.9322, + "step": 577 + }, + { + "epoch": 0.8364688856729378, + "grad_norm": 1.3323416709899902, + "learning_rate": 3.2904263877715208e-06, + "loss": 0.968, + "step": 578 + }, + { + "epoch": 0.8379160636758322, + "grad_norm": 1.3907710313796997, + "learning_rate": 3.286403861625101e-06, + "loss": 0.9843, + "step": 579 + }, + { + "epoch": 0.8393632416787264, + "grad_norm": 1.313252329826355, + "learning_rate": 3.282381335478681e-06, + "loss": 1.0012, + "step": 580 + }, + { + "epoch": 0.8393632416787264, + "eval_loss": 1.0752770900726318, + "eval_runtime": 23.728, + "eval_samples_per_second": 42.144, + "eval_steps_per_second": 2.655, + "step": 580 + }, + { + "epoch": 0.8408104196816208, + "grad_norm": 1.3566845655441284, + "learning_rate": 3.278358809332261e-06, + "loss": 0.9567, + "step": 581 + }, + { + "epoch": 0.8422575976845152, + "grad_norm": 1.3386502265930176, + "learning_rate": 3.274336283185841e-06, + "loss": 0.9526, + "step": 582 + }, + { + "epoch": 0.8437047756874095, + "grad_norm": 1.3104040622711182, + "learning_rate": 3.270313757039421e-06, + "loss": 1.0018, + "step": 583 + }, + { + "epoch": 0.8451519536903039, + "grad_norm": 1.3594677448272705, + "learning_rate": 3.266291230893001e-06, + "loss": 1.0088, + "step": 584 + }, + { + "epoch": 0.8465991316931982, + "grad_norm": 1.3568425178527832, + "learning_rate": 3.2622687047465813e-06, + "loss": 0.9564, + "step": 585 + }, + { + "epoch": 0.8480463096960926, + "grad_norm": 1.291952133178711, + "learning_rate": 3.2582461786001614e-06, + "loss": 0.9846, + "step": 586 + }, + { + "epoch": 0.849493487698987, + "grad_norm": 1.2595186233520508, + "learning_rate": 3.254223652453741e-06, + "loss": 0.9644, + "step": 587 + }, + { + "epoch": 0.8509406657018813, + "grad_norm": 1.3498653173446655, + "learning_rate": 3.2502011263073212e-06, + "loss": 0.9959, + "step": 588 + }, + { + "epoch": 0.8523878437047757, + "grad_norm": 1.375861644744873, + "learning_rate": 3.2461786001609013e-06, + "loss": 0.9976, + "step": 589 + }, + { + "epoch": 0.85383502170767, + "grad_norm": 1.33830726146698, + "learning_rate": 3.2421560740144815e-06, + "loss": 1.0093, + "step": 590 + }, + { + "epoch": 0.85383502170767, + "eval_loss": 1.0760489702224731, + "eval_runtime": 23.7052, + "eval_samples_per_second": 42.185, + "eval_steps_per_second": 2.658, + "step": 590 + }, + { + "epoch": 0.8552821997105644, + "grad_norm": 1.3555772304534912, + "learning_rate": 3.2381335478680616e-06, + "loss": 0.9752, + "step": 591 + }, + { + "epoch": 0.8567293777134588, + "grad_norm": 1.3533320426940918, + "learning_rate": 3.2341110217216413e-06, + "loss": 0.9804, + "step": 592 + }, + { + "epoch": 0.8581765557163531, + "grad_norm": 1.3967736959457397, + "learning_rate": 3.2300884955752214e-06, + "loss": 1.0062, + "step": 593 + }, + { + "epoch": 0.8596237337192475, + "grad_norm": 1.383620262145996, + "learning_rate": 3.2260659694288016e-06, + "loss": 0.956, + "step": 594 + }, + { + "epoch": 0.8610709117221418, + "grad_norm": 1.4314731359481812, + "learning_rate": 3.2220434432823817e-06, + "loss": 0.9674, + "step": 595 + }, + { + "epoch": 0.8625180897250362, + "grad_norm": 1.4000535011291504, + "learning_rate": 3.218020917135962e-06, + "loss": 0.9836, + "step": 596 + }, + { + "epoch": 0.8639652677279306, + "grad_norm": 1.341631531715393, + "learning_rate": 3.2139983909895415e-06, + "loss": 0.9854, + "step": 597 + }, + { + "epoch": 0.8654124457308249, + "grad_norm": 1.3896502256393433, + "learning_rate": 3.2099758648431216e-06, + "loss": 0.9797, + "step": 598 + }, + { + "epoch": 0.8668596237337193, + "grad_norm": 1.5060045719146729, + "learning_rate": 3.2059533386967018e-06, + "loss": 0.9637, + "step": 599 + }, + { + "epoch": 0.8683068017366136, + "grad_norm": 1.289658546447754, + "learning_rate": 3.201930812550282e-06, + "loss": 0.9818, + "step": 600 + }, + { + "epoch": 0.8683068017366136, + "eval_loss": 1.076809287071228, + "eval_runtime": 23.77, + "eval_samples_per_second": 42.07, + "eval_steps_per_second": 2.65, + "step": 600 + }, + { + "epoch": 0.869753979739508, + "grad_norm": 1.320397973060608, + "learning_rate": 3.197908286403862e-06, + "loss": 0.9925, + "step": 601 + }, + { + "epoch": 0.8712011577424024, + "grad_norm": 1.4017888307571411, + "learning_rate": 3.1938857602574417e-06, + "loss": 0.9566, + "step": 602 + }, + { + "epoch": 0.8726483357452967, + "grad_norm": 1.3646881580352783, + "learning_rate": 3.189863234111022e-06, + "loss": 0.9873, + "step": 603 + }, + { + "epoch": 0.874095513748191, + "grad_norm": 1.3507118225097656, + "learning_rate": 3.185840707964602e-06, + "loss": 0.959, + "step": 604 + }, + { + "epoch": 0.8755426917510853, + "grad_norm": 1.3340078592300415, + "learning_rate": 3.181818181818182e-06, + "loss": 0.9729, + "step": 605 + }, + { + "epoch": 0.8769898697539797, + "grad_norm": 1.3939846754074097, + "learning_rate": 3.1777956556717622e-06, + "loss": 0.9695, + "step": 606 + }, + { + "epoch": 0.8784370477568741, + "grad_norm": 1.3288594484329224, + "learning_rate": 3.173773129525342e-06, + "loss": 0.9396, + "step": 607 + }, + { + "epoch": 0.8798842257597684, + "grad_norm": 1.3205933570861816, + "learning_rate": 3.169750603378922e-06, + "loss": 0.9552, + "step": 608 + }, + { + "epoch": 0.8813314037626628, + "grad_norm": 1.4129999876022339, + "learning_rate": 3.165728077232502e-06, + "loss": 0.9726, + "step": 609 + }, + { + "epoch": 0.8827785817655571, + "grad_norm": 1.3209491968154907, + "learning_rate": 3.1617055510860823e-06, + "loss": 0.9313, + "step": 610 + }, + { + "epoch": 0.8827785817655571, + "eval_loss": 1.0764902830123901, + "eval_runtime": 23.6284, + "eval_samples_per_second": 42.322, + "eval_steps_per_second": 2.666, + "step": 610 + }, + { + "epoch": 0.8842257597684515, + "grad_norm": 1.4997881650924683, + "learning_rate": 3.1576830249396625e-06, + "loss": 0.9667, + "step": 611 + }, + { + "epoch": 0.8856729377713459, + "grad_norm": 1.3630247116088867, + "learning_rate": 3.153660498793242e-06, + "loss": 0.9694, + "step": 612 + }, + { + "epoch": 0.8871201157742402, + "grad_norm": 1.3162708282470703, + "learning_rate": 3.1496379726468223e-06, + "loss": 0.9902, + "step": 613 + }, + { + "epoch": 0.8885672937771346, + "grad_norm": 1.363140344619751, + "learning_rate": 3.1456154465004024e-06, + "loss": 0.944, + "step": 614 + }, + { + "epoch": 0.8900144717800289, + "grad_norm": 1.335150122642517, + "learning_rate": 3.1415929203539825e-06, + "loss": 0.9682, + "step": 615 + }, + { + "epoch": 0.8914616497829233, + "grad_norm": 1.3751360177993774, + "learning_rate": 3.1375703942075627e-06, + "loss": 0.9853, + "step": 616 + }, + { + "epoch": 0.8929088277858177, + "grad_norm": 1.395434021949768, + "learning_rate": 3.1335478680611424e-06, + "loss": 0.9766, + "step": 617 + }, + { + "epoch": 0.894356005788712, + "grad_norm": 1.3107879161834717, + "learning_rate": 3.1295253419147225e-06, + "loss": 0.9743, + "step": 618 + }, + { + "epoch": 0.8958031837916064, + "grad_norm": 1.4136391878128052, + "learning_rate": 3.1255028157683026e-06, + "loss": 0.9943, + "step": 619 + }, + { + "epoch": 0.8972503617945007, + "grad_norm": 1.3775230646133423, + "learning_rate": 3.1214802896218828e-06, + "loss": 1.0025, + "step": 620 + }, + { + "epoch": 0.8972503617945007, + "eval_loss": 1.073117971420288, + "eval_runtime": 23.8261, + "eval_samples_per_second": 41.971, + "eval_steps_per_second": 2.644, + "step": 620 + }, + { + "epoch": 0.8986975397973951, + "grad_norm": 1.379296064376831, + "learning_rate": 3.117457763475463e-06, + "loss": 0.9994, + "step": 621 + }, + { + "epoch": 0.9001447178002895, + "grad_norm": 1.3880268335342407, + "learning_rate": 3.1134352373290426e-06, + "loss": 0.9897, + "step": 622 + }, + { + "epoch": 0.9015918958031838, + "grad_norm": 1.347123622894287, + "learning_rate": 3.1094127111826227e-06, + "loss": 1.0066, + "step": 623 + }, + { + "epoch": 0.9030390738060782, + "grad_norm": 1.3120102882385254, + "learning_rate": 3.105390185036203e-06, + "loss": 0.959, + "step": 624 + }, + { + "epoch": 0.9044862518089725, + "grad_norm": 1.2928720712661743, + "learning_rate": 3.101367658889783e-06, + "loss": 0.9406, + "step": 625 + }, + { + "epoch": 0.9059334298118669, + "grad_norm": 1.4569941759109497, + "learning_rate": 3.097345132743363e-06, + "loss": 0.9833, + "step": 626 + }, + { + "epoch": 0.9073806078147613, + "grad_norm": 1.3429334163665771, + "learning_rate": 3.093322606596943e-06, + "loss": 1.0093, + "step": 627 + }, + { + "epoch": 0.9088277858176556, + "grad_norm": 1.3633230924606323, + "learning_rate": 3.089300080450523e-06, + "loss": 0.9661, + "step": 628 + }, + { + "epoch": 0.91027496382055, + "grad_norm": 1.467278242111206, + "learning_rate": 3.085277554304103e-06, + "loss": 0.973, + "step": 629 + }, + { + "epoch": 0.9117221418234442, + "grad_norm": 1.322097897529602, + "learning_rate": 3.081255028157683e-06, + "loss": 0.9205, + "step": 630 + }, + { + "epoch": 0.9117221418234442, + "eval_loss": 1.0719982385635376, + "eval_runtime": 23.7391, + "eval_samples_per_second": 42.125, + "eval_steps_per_second": 2.654, + "step": 630 + }, + { + "epoch": 0.9131693198263386, + "grad_norm": 1.3843573331832886, + "learning_rate": 3.0772325020112633e-06, + "loss": 1.0081, + "step": 631 + }, + { + "epoch": 0.914616497829233, + "grad_norm": 1.2887941598892212, + "learning_rate": 3.073209975864843e-06, + "loss": 0.9893, + "step": 632 + }, + { + "epoch": 0.9160636758321273, + "grad_norm": 1.3732166290283203, + "learning_rate": 3.069187449718423e-06, + "loss": 0.9737, + "step": 633 + }, + { + "epoch": 0.9175108538350217, + "grad_norm": 1.3207870721817017, + "learning_rate": 3.0651649235720033e-06, + "loss": 0.9568, + "step": 634 + }, + { + "epoch": 0.918958031837916, + "grad_norm": 1.2793110609054565, + "learning_rate": 3.0611423974255834e-06, + "loss": 0.9727, + "step": 635 + }, + { + "epoch": 0.9204052098408104, + "grad_norm": 1.4250153303146362, + "learning_rate": 3.057119871279163e-06, + "loss": 0.9635, + "step": 636 + }, + { + "epoch": 0.9218523878437048, + "grad_norm": 1.3892358541488647, + "learning_rate": 3.0530973451327432e-06, + "loss": 0.9647, + "step": 637 + }, + { + "epoch": 0.9232995658465991, + "grad_norm": 1.4442740678787231, + "learning_rate": 3.0490748189863234e-06, + "loss": 1.0271, + "step": 638 + }, + { + "epoch": 0.9247467438494935, + "grad_norm": 1.408539891242981, + "learning_rate": 3.0450522928399035e-06, + "loss": 0.9939, + "step": 639 + }, + { + "epoch": 0.9261939218523878, + "grad_norm": 1.4245710372924805, + "learning_rate": 3.0410297666934836e-06, + "loss": 0.9888, + "step": 640 + }, + { + "epoch": 0.9261939218523878, + "eval_loss": 1.073214054107666, + "eval_runtime": 23.7269, + "eval_samples_per_second": 42.146, + "eval_steps_per_second": 2.655, + "step": 640 + }, + { + "epoch": 0.9276410998552822, + "grad_norm": 1.4141292572021484, + "learning_rate": 3.0370072405470633e-06, + "loss": 0.9644, + "step": 641 + }, + { + "epoch": 0.9290882778581766, + "grad_norm": 1.3800727128982544, + "learning_rate": 3.0329847144006443e-06, + "loss": 0.9871, + "step": 642 + }, + { + "epoch": 0.9305354558610709, + "grad_norm": 1.3321155309677124, + "learning_rate": 3.028962188254224e-06, + "loss": 0.985, + "step": 643 + }, + { + "epoch": 0.9319826338639653, + "grad_norm": 1.4205679893493652, + "learning_rate": 3.024939662107804e-06, + "loss": 0.9675, + "step": 644 + }, + { + "epoch": 0.9334298118668596, + "grad_norm": 1.4397610425949097, + "learning_rate": 3.0209171359613843e-06, + "loss": 0.9742, + "step": 645 + }, + { + "epoch": 0.934876989869754, + "grad_norm": 1.3359534740447998, + "learning_rate": 3.0168946098149644e-06, + "loss": 0.9501, + "step": 646 + }, + { + "epoch": 0.9363241678726484, + "grad_norm": 1.331691861152649, + "learning_rate": 3.0128720836685445e-06, + "loss": 0.9731, + "step": 647 + }, + { + "epoch": 0.9377713458755427, + "grad_norm": 1.332754135131836, + "learning_rate": 3.0088495575221242e-06, + "loss": 0.9533, + "step": 648 + }, + { + "epoch": 0.9392185238784371, + "grad_norm": 1.2960954904556274, + "learning_rate": 3.0048270313757043e-06, + "loss": 0.9918, + "step": 649 + }, + { + "epoch": 0.9406657018813314, + "grad_norm": 1.3787429332733154, + "learning_rate": 3.0008045052292845e-06, + "loss": 0.9565, + "step": 650 + }, + { + "epoch": 0.9406657018813314, + "eval_loss": 1.0720397233963013, + "eval_runtime": 23.6826, + "eval_samples_per_second": 42.225, + "eval_steps_per_second": 2.66, + "step": 650 + }, + { + "epoch": 0.9421128798842258, + "grad_norm": 1.3344447612762451, + "learning_rate": 2.9967819790828646e-06, + "loss": 0.9655, + "step": 651 + }, + { + "epoch": 0.9435600578871202, + "grad_norm": 1.3047369718551636, + "learning_rate": 2.9927594529364447e-06, + "loss": 0.9544, + "step": 652 + }, + { + "epoch": 0.9450072358900145, + "grad_norm": 1.2787832021713257, + "learning_rate": 2.9887369267900244e-06, + "loss": 0.9657, + "step": 653 + }, + { + "epoch": 0.9464544138929089, + "grad_norm": 1.295332431793213, + "learning_rate": 2.9847144006436046e-06, + "loss": 0.9744, + "step": 654 + }, + { + "epoch": 0.9479015918958031, + "grad_norm": 1.397257685661316, + "learning_rate": 2.9806918744971847e-06, + "loss": 1.0082, + "step": 655 + }, + { + "epoch": 0.9493487698986975, + "grad_norm": 1.378974199295044, + "learning_rate": 2.976669348350765e-06, + "loss": 1.0068, + "step": 656 + }, + { + "epoch": 0.9507959479015919, + "grad_norm": 1.4262326955795288, + "learning_rate": 2.972646822204345e-06, + "loss": 0.9574, + "step": 657 + }, + { + "epoch": 0.9522431259044862, + "grad_norm": 1.4602223634719849, + "learning_rate": 2.9686242960579246e-06, + "loss": 0.9505, + "step": 658 + }, + { + "epoch": 0.9536903039073806, + "grad_norm": 1.3622018098831177, + "learning_rate": 2.9646017699115048e-06, + "loss": 0.9658, + "step": 659 + }, + { + "epoch": 0.9551374819102749, + "grad_norm": 1.3139601945877075, + "learning_rate": 2.960579243765085e-06, + "loss": 0.9516, + "step": 660 + }, + { + "epoch": 0.9551374819102749, + "eval_loss": 1.0695888996124268, + "eval_runtime": 23.7978, + "eval_samples_per_second": 42.021, + "eval_steps_per_second": 2.647, + "step": 660 + }, + { + "epoch": 0.9565846599131693, + "grad_norm": 1.3879914283752441, + "learning_rate": 2.956556717618665e-06, + "loss": 0.9992, + "step": 661 + }, + { + "epoch": 0.9580318379160637, + "grad_norm": 1.3674969673156738, + "learning_rate": 2.952534191472245e-06, + "loss": 0.9588, + "step": 662 + }, + { + "epoch": 0.959479015918958, + "grad_norm": 1.3705657720565796, + "learning_rate": 2.948511665325825e-06, + "loss": 0.965, + "step": 663 + }, + { + "epoch": 0.9609261939218524, + "grad_norm": 1.4762684106826782, + "learning_rate": 2.944489139179405e-06, + "loss": 0.9907, + "step": 664 + }, + { + "epoch": 0.9623733719247467, + "grad_norm": 1.3814926147460938, + "learning_rate": 2.940466613032985e-06, + "loss": 0.9055, + "step": 665 + }, + { + "epoch": 0.9638205499276411, + "grad_norm": 1.3113678693771362, + "learning_rate": 2.9364440868865652e-06, + "loss": 0.9915, + "step": 666 + }, + { + "epoch": 0.9652677279305355, + "grad_norm": 1.3303974866867065, + "learning_rate": 2.9324215607401454e-06, + "loss": 0.9703, + "step": 667 + }, + { + "epoch": 0.9667149059334298, + "grad_norm": 1.4883965253829956, + "learning_rate": 2.928399034593725e-06, + "loss": 0.9943, + "step": 668 + }, + { + "epoch": 0.9681620839363242, + "grad_norm": 1.3305892944335938, + "learning_rate": 2.924376508447305e-06, + "loss": 0.9673, + "step": 669 + }, + { + "epoch": 0.9696092619392185, + "grad_norm": 1.3305500745773315, + "learning_rate": 2.9203539823008853e-06, + "loss": 0.9199, + "step": 670 + }, + { + "epoch": 0.9696092619392185, + "eval_loss": 1.0688841342926025, + "eval_runtime": 23.8181, + "eval_samples_per_second": 41.985, + "eval_steps_per_second": 2.645, + "step": 670 + }, + { + "epoch": 0.9710564399421129, + "grad_norm": 1.425545573234558, + "learning_rate": 2.9163314561544655e-06, + "loss": 0.9723, + "step": 671 + }, + { + "epoch": 0.9725036179450073, + "grad_norm": 1.3736907243728638, + "learning_rate": 2.912308930008045e-06, + "loss": 0.946, + "step": 672 + }, + { + "epoch": 0.9739507959479016, + "grad_norm": 1.326892614364624, + "learning_rate": 2.9082864038616253e-06, + "loss": 0.9676, + "step": 673 + }, + { + "epoch": 0.975397973950796, + "grad_norm": 1.3270180225372314, + "learning_rate": 2.9042638777152054e-06, + "loss": 0.928, + "step": 674 + }, + { + "epoch": 0.9768451519536903, + "grad_norm": 1.4843968152999878, + "learning_rate": 2.9002413515687855e-06, + "loss": 0.9555, + "step": 675 + }, + { + "epoch": 0.9782923299565847, + "grad_norm": 1.4869716167449951, + "learning_rate": 2.8962188254223657e-06, + "loss": 0.9727, + "step": 676 + }, + { + "epoch": 0.9797395079594791, + "grad_norm": 1.2844302654266357, + "learning_rate": 2.8921962992759454e-06, + "loss": 0.9179, + "step": 677 + }, + { + "epoch": 0.9811866859623734, + "grad_norm": 1.4501268863677979, + "learning_rate": 2.8881737731295255e-06, + "loss": 0.9829, + "step": 678 + }, + { + "epoch": 0.9826338639652678, + "grad_norm": 1.4102064371109009, + "learning_rate": 2.8841512469831056e-06, + "loss": 0.979, + "step": 679 + }, + { + "epoch": 0.984081041968162, + "grad_norm": 1.3757656812667847, + "learning_rate": 2.8801287208366858e-06, + "loss": 1.0051, + "step": 680 + }, + { + "epoch": 0.984081041968162, + "eval_loss": 1.0675874948501587, + "eval_runtime": 23.8087, + "eval_samples_per_second": 42.001, + "eval_steps_per_second": 2.646, + "step": 680 + }, + { + "epoch": 0.9855282199710564, + "grad_norm": 1.3570847511291504, + "learning_rate": 2.876106194690266e-06, + "loss": 0.9784, + "step": 681 + }, + { + "epoch": 0.9869753979739508, + "grad_norm": 1.5033375024795532, + "learning_rate": 2.8720836685438456e-06, + "loss": 0.9318, + "step": 682 + }, + { + "epoch": 0.9884225759768451, + "grad_norm": 1.3659757375717163, + "learning_rate": 2.8680611423974257e-06, + "loss": 0.9907, + "step": 683 + }, + { + "epoch": 0.9898697539797395, + "grad_norm": 1.3127025365829468, + "learning_rate": 2.864038616251006e-06, + "loss": 0.9513, + "step": 684 + }, + { + "epoch": 0.9913169319826338, + "grad_norm": 1.4611915349960327, + "learning_rate": 2.860016090104586e-06, + "loss": 0.9566, + "step": 685 + }, + { + "epoch": 0.9927641099855282, + "grad_norm": 1.3685567378997803, + "learning_rate": 2.855993563958166e-06, + "loss": 0.9754, + "step": 686 + }, + { + "epoch": 0.9942112879884226, + "grad_norm": 1.365681529045105, + "learning_rate": 2.851971037811746e-06, + "loss": 0.9593, + "step": 687 + }, + { + "epoch": 0.9956584659913169, + "grad_norm": 1.4232324361801147, + "learning_rate": 2.847948511665326e-06, + "loss": 0.9492, + "step": 688 + }, + { + "epoch": 0.9971056439942113, + "grad_norm": 1.4242656230926514, + "learning_rate": 2.843925985518906e-06, + "loss": 0.9642, + "step": 689 + }, + { + "epoch": 0.9985528219971056, + "grad_norm": 1.3681212663650513, + "learning_rate": 2.839903459372486e-06, + "loss": 1.0, + "step": 690 + }, + { + "epoch": 0.9985528219971056, + "eval_loss": 1.0660607814788818, + "eval_runtime": 23.7888, + "eval_samples_per_second": 42.037, + "eval_steps_per_second": 2.648, + "step": 690 + }, + { + "epoch": 1.0, + "grad_norm": 1.3575904369354248, + "learning_rate": 2.8358809332260663e-06, + "loss": 0.9455, + "step": 691 + }, + { + "epoch": 1.0014471780028944, + "grad_norm": 1.3744746446609497, + "learning_rate": 2.831858407079646e-06, + "loss": 0.9389, + "step": 692 + }, + { + "epoch": 1.0028943560057888, + "grad_norm": 1.4549713134765625, + "learning_rate": 2.827835880933226e-06, + "loss": 0.9338, + "step": 693 + }, + { + "epoch": 1.004341534008683, + "grad_norm": 1.35031259059906, + "learning_rate": 2.8238133547868063e-06, + "loss": 0.9697, + "step": 694 + }, + { + "epoch": 1.0057887120115774, + "grad_norm": 1.56009840965271, + "learning_rate": 2.8197908286403864e-06, + "loss": 0.9818, + "step": 695 + }, + { + "epoch": 1.0072358900144718, + "grad_norm": 1.650952935218811, + "learning_rate": 2.8157683024939665e-06, + "loss": 0.9722, + "step": 696 + }, + { + "epoch": 1.0086830680173662, + "grad_norm": 1.4660834074020386, + "learning_rate": 2.8117457763475462e-06, + "loss": 0.9466, + "step": 697 + }, + { + "epoch": 1.0101302460202606, + "grad_norm": 1.504908561706543, + "learning_rate": 2.8077232502011264e-06, + "loss": 0.9345, + "step": 698 + }, + { + "epoch": 1.0115774240231548, + "grad_norm": 1.433066725730896, + "learning_rate": 2.8037007240547065e-06, + "loss": 0.9364, + "step": 699 + }, + { + "epoch": 1.0130246020260492, + "grad_norm": 1.4966166019439697, + "learning_rate": 2.7996781979082866e-06, + "loss": 1.0035, + "step": 700 + }, + { + "epoch": 1.0130246020260492, + "eval_loss": 1.0664418935775757, + "eval_runtime": 23.8038, + "eval_samples_per_second": 42.01, + "eval_steps_per_second": 2.647, + "step": 700 + }, + { + "epoch": 1.0144717800289436, + "grad_norm": 1.4819644689559937, + "learning_rate": 2.7956556717618667e-06, + "loss": 0.9903, + "step": 701 + }, + { + "epoch": 1.015918958031838, + "grad_norm": 1.4150784015655518, + "learning_rate": 2.7916331456154465e-06, + "loss": 0.9842, + "step": 702 + }, + { + "epoch": 1.0173661360347324, + "grad_norm": 1.464428186416626, + "learning_rate": 2.7876106194690266e-06, + "loss": 0.9787, + "step": 703 + }, + { + "epoch": 1.0188133140376265, + "grad_norm": 1.383646845817566, + "learning_rate": 2.7835880933226067e-06, + "loss": 0.9589, + "step": 704 + }, + { + "epoch": 1.020260492040521, + "grad_norm": 1.4479550123214722, + "learning_rate": 2.779565567176187e-06, + "loss": 0.9987, + "step": 705 + }, + { + "epoch": 1.0217076700434153, + "grad_norm": 1.4371713399887085, + "learning_rate": 2.775543041029767e-06, + "loss": 0.9236, + "step": 706 + }, + { + "epoch": 1.0231548480463097, + "grad_norm": 1.3693134784698486, + "learning_rate": 2.7715205148833467e-06, + "loss": 0.9634, + "step": 707 + }, + { + "epoch": 1.0246020260492041, + "grad_norm": 1.4582874774932861, + "learning_rate": 2.767497988736927e-06, + "loss": 1.0035, + "step": 708 + }, + { + "epoch": 1.0260492040520983, + "grad_norm": 1.415440559387207, + "learning_rate": 2.763475462590507e-06, + "loss": 0.9145, + "step": 709 + }, + { + "epoch": 1.0274963820549927, + "grad_norm": 1.4009870290756226, + "learning_rate": 2.759452936444087e-06, + "loss": 0.9393, + "step": 710 + }, + { + "epoch": 1.0274963820549927, + "eval_loss": 1.0632503032684326, + "eval_runtime": 23.6971, + "eval_samples_per_second": 42.199, + "eval_steps_per_second": 2.659, + "step": 710 + }, + { + "epoch": 1.0289435600578871, + "grad_norm": 1.4486078023910522, + "learning_rate": 2.755430410297667e-06, + "loss": 0.9667, + "step": 711 + }, + { + "epoch": 1.0303907380607815, + "grad_norm": 1.4247418642044067, + "learning_rate": 2.751407884151247e-06, + "loss": 0.9547, + "step": 712 + }, + { + "epoch": 1.031837916063676, + "grad_norm": 1.436584234237671, + "learning_rate": 2.747385358004827e-06, + "loss": 0.9747, + "step": 713 + }, + { + "epoch": 1.03328509406657, + "grad_norm": 1.599297285079956, + "learning_rate": 2.743362831858407e-06, + "loss": 0.9532, + "step": 714 + }, + { + "epoch": 1.0347322720694645, + "grad_norm": 1.4135537147521973, + "learning_rate": 2.7393403057119873e-06, + "loss": 0.967, + "step": 715 + }, + { + "epoch": 1.036179450072359, + "grad_norm": 1.3846684694290161, + "learning_rate": 2.7353177795655674e-06, + "loss": 0.9657, + "step": 716 + }, + { + "epoch": 1.0376266280752533, + "grad_norm": 1.4375203847885132, + "learning_rate": 2.731295253419147e-06, + "loss": 0.9525, + "step": 717 + }, + { + "epoch": 1.0390738060781477, + "grad_norm": 1.3957698345184326, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.932, + "step": 718 + }, + { + "epoch": 1.0405209840810419, + "grad_norm": 1.4869130849838257, + "learning_rate": 2.7232502011263074e-06, + "loss": 0.9386, + "step": 719 + }, + { + "epoch": 1.0419681620839363, + "grad_norm": 1.4138269424438477, + "learning_rate": 2.7192276749798875e-06, + "loss": 0.9765, + "step": 720 + }, + { + "epoch": 1.0419681620839363, + "eval_loss": 1.0653059482574463, + "eval_runtime": 23.7929, + "eval_samples_per_second": 42.029, + "eval_steps_per_second": 2.648, + "step": 720 + }, + { + "epoch": 1.0434153400868307, + "grad_norm": 1.4324676990509033, + "learning_rate": 2.7152051488334676e-06, + "loss": 0.943, + "step": 721 + }, + { + "epoch": 1.044862518089725, + "grad_norm": 1.4571518898010254, + "learning_rate": 2.7111826226870473e-06, + "loss": 0.9306, + "step": 722 + }, + { + "epoch": 1.0463096960926195, + "grad_norm": 1.3712648153305054, + "learning_rate": 2.7071600965406274e-06, + "loss": 0.9598, + "step": 723 + }, + { + "epoch": 1.0477568740955137, + "grad_norm": 1.4532585144042969, + "learning_rate": 2.7031375703942076e-06, + "loss": 0.9558, + "step": 724 + }, + { + "epoch": 1.049204052098408, + "grad_norm": 1.3885958194732666, + "learning_rate": 2.6991150442477877e-06, + "loss": 0.9112, + "step": 725 + }, + { + "epoch": 1.0506512301013025, + "grad_norm": 1.3187217712402344, + "learning_rate": 2.695092518101368e-06, + "loss": 0.8962, + "step": 726 + }, + { + "epoch": 1.0520984081041969, + "grad_norm": 1.4057087898254395, + "learning_rate": 2.6910699919549475e-06, + "loss": 0.9524, + "step": 727 + }, + { + "epoch": 1.0535455861070913, + "grad_norm": 1.4266270399093628, + "learning_rate": 2.6870474658085277e-06, + "loss": 0.9542, + "step": 728 + }, + { + "epoch": 1.0549927641099854, + "grad_norm": 1.48124361038208, + "learning_rate": 2.6830249396621078e-06, + "loss": 1.0166, + "step": 729 + }, + { + "epoch": 1.0564399421128798, + "grad_norm": 1.3916049003601074, + "learning_rate": 2.679002413515688e-06, + "loss": 0.9513, + "step": 730 + }, + { + "epoch": 1.0564399421128798, + "eval_loss": 1.0657588243484497, + "eval_runtime": 23.807, + "eval_samples_per_second": 42.004, + "eval_steps_per_second": 2.646, + "step": 730 + }, + { + "epoch": 1.0578871201157742, + "grad_norm": 1.4604647159576416, + "learning_rate": 2.6749798873692685e-06, + "loss": 0.9253, + "step": 731 + }, + { + "epoch": 1.0593342981186686, + "grad_norm": 1.39705228805542, + "learning_rate": 2.6709573612228486e-06, + "loss": 0.9524, + "step": 732 + }, + { + "epoch": 1.060781476121563, + "grad_norm": 1.5852898359298706, + "learning_rate": 2.6669348350764283e-06, + "loss": 0.9664, + "step": 733 + }, + { + "epoch": 1.0622286541244572, + "grad_norm": 1.493071436882019, + "learning_rate": 2.6629123089300084e-06, + "loss": 0.9519, + "step": 734 + }, + { + "epoch": 1.0636758321273516, + "grad_norm": 1.4261360168457031, + "learning_rate": 2.6588897827835885e-06, + "loss": 0.9033, + "step": 735 + }, + { + "epoch": 1.065123010130246, + "grad_norm": 1.373785376548767, + "learning_rate": 2.6548672566371687e-06, + "loss": 0.9506, + "step": 736 + }, + { + "epoch": 1.0665701881331404, + "grad_norm": 1.507643222808838, + "learning_rate": 2.650844730490749e-06, + "loss": 0.9496, + "step": 737 + }, + { + "epoch": 1.0680173661360348, + "grad_norm": 1.4759060144424438, + "learning_rate": 2.6468222043443285e-06, + "loss": 0.9749, + "step": 738 + }, + { + "epoch": 1.069464544138929, + "grad_norm": 1.4467190504074097, + "learning_rate": 2.6427996781979086e-06, + "loss": 0.9903, + "step": 739 + }, + { + "epoch": 1.0709117221418234, + "grad_norm": 1.4200372695922852, + "learning_rate": 2.6387771520514888e-06, + "loss": 0.9692, + "step": 740 + }, + { + "epoch": 1.0709117221418234, + "eval_loss": 1.063270926475525, + "eval_runtime": 23.7948, + "eval_samples_per_second": 42.026, + "eval_steps_per_second": 2.648, + "step": 740 + }, + { + "epoch": 1.0723589001447178, + "grad_norm": 1.4274908304214478, + "learning_rate": 2.634754625905069e-06, + "loss": 0.9093, + "step": 741 + }, + { + "epoch": 1.0738060781476122, + "grad_norm": 1.3824751377105713, + "learning_rate": 2.630732099758649e-06, + "loss": 0.9448, + "step": 742 + }, + { + "epoch": 1.0752532561505066, + "grad_norm": 1.475680947303772, + "learning_rate": 2.6267095736122287e-06, + "loss": 0.9495, + "step": 743 + }, + { + "epoch": 1.0767004341534008, + "grad_norm": 1.5366449356079102, + "learning_rate": 2.622687047465809e-06, + "loss": 0.95, + "step": 744 + }, + { + "epoch": 1.0781476121562952, + "grad_norm": 1.4599356651306152, + "learning_rate": 2.618664521319389e-06, + "loss": 0.9298, + "step": 745 + }, + { + "epoch": 1.0795947901591896, + "grad_norm": 1.3807423114776611, + "learning_rate": 2.614641995172969e-06, + "loss": 0.962, + "step": 746 + }, + { + "epoch": 1.081041968162084, + "grad_norm": 1.4419299364089966, + "learning_rate": 2.6106194690265492e-06, + "loss": 0.9669, + "step": 747 + }, + { + "epoch": 1.0824891461649784, + "grad_norm": 1.4543246030807495, + "learning_rate": 2.606596942880129e-06, + "loss": 0.9961, + "step": 748 + }, + { + "epoch": 1.0839363241678726, + "grad_norm": 1.4083486795425415, + "learning_rate": 2.602574416733709e-06, + "loss": 0.925, + "step": 749 + }, + { + "epoch": 1.085383502170767, + "grad_norm": 1.484033465385437, + "learning_rate": 2.598551890587289e-06, + "loss": 0.9567, + "step": 750 + }, + { + "epoch": 1.085383502170767, + "eval_loss": 1.0640690326690674, + "eval_runtime": 23.7787, + "eval_samples_per_second": 42.054, + "eval_steps_per_second": 2.649, + "step": 750 + }, + { + "epoch": 1.0868306801736614, + "grad_norm": 1.4217402935028076, + "learning_rate": 2.5945293644408693e-06, + "loss": 0.9577, + "step": 751 + }, + { + "epoch": 1.0882778581765558, + "grad_norm": 1.631165862083435, + "learning_rate": 2.5905068382944494e-06, + "loss": 0.9521, + "step": 752 + }, + { + "epoch": 1.0897250361794502, + "grad_norm": 1.6165910959243774, + "learning_rate": 2.586484312148029e-06, + "loss": 0.9704, + "step": 753 + }, + { + "epoch": 1.0911722141823443, + "grad_norm": 1.5343152284622192, + "learning_rate": 2.5824617860016093e-06, + "loss": 0.977, + "step": 754 + }, + { + "epoch": 1.0926193921852387, + "grad_norm": 1.43861985206604, + "learning_rate": 2.5784392598551894e-06, + "loss": 0.9777, + "step": 755 + }, + { + "epoch": 1.0940665701881331, + "grad_norm": 1.4859222173690796, + "learning_rate": 2.5744167337087695e-06, + "loss": 0.9157, + "step": 756 + }, + { + "epoch": 1.0955137481910275, + "grad_norm": 1.4163947105407715, + "learning_rate": 2.5703942075623497e-06, + "loss": 0.9647, + "step": 757 + }, + { + "epoch": 1.096960926193922, + "grad_norm": 1.3839441537857056, + "learning_rate": 2.5663716814159294e-06, + "loss": 0.9519, + "step": 758 + }, + { + "epoch": 1.0984081041968161, + "grad_norm": 1.4873661994934082, + "learning_rate": 2.5623491552695095e-06, + "loss": 0.9405, + "step": 759 + }, + { + "epoch": 1.0998552821997105, + "grad_norm": 1.6223297119140625, + "learning_rate": 2.5583266291230896e-06, + "loss": 0.9187, + "step": 760 + }, + { + "epoch": 1.0998552821997105, + "eval_loss": 1.0672178268432617, + "eval_runtime": 23.751, + "eval_samples_per_second": 42.103, + "eval_steps_per_second": 2.653, + "step": 760 + }, + { + "epoch": 1.101302460202605, + "grad_norm": 1.5472077131271362, + "learning_rate": 2.5543041029766698e-06, + "loss": 0.9497, + "step": 761 + }, + { + "epoch": 1.1027496382054993, + "grad_norm": 1.5900278091430664, + "learning_rate": 2.55028157683025e-06, + "loss": 0.9243, + "step": 762 + }, + { + "epoch": 1.1041968162083937, + "grad_norm": 1.5092447996139526, + "learning_rate": 2.5462590506838296e-06, + "loss": 0.9267, + "step": 763 + }, + { + "epoch": 1.105643994211288, + "grad_norm": 1.4817793369293213, + "learning_rate": 2.5422365245374097e-06, + "loss": 0.9778, + "step": 764 + }, + { + "epoch": 1.1070911722141823, + "grad_norm": 1.3759835958480835, + "learning_rate": 2.53821399839099e-06, + "loss": 0.949, + "step": 765 + }, + { + "epoch": 1.1085383502170767, + "grad_norm": 1.4497779607772827, + "learning_rate": 2.53419147224457e-06, + "loss": 0.9463, + "step": 766 + }, + { + "epoch": 1.109985528219971, + "grad_norm": 1.4664055109024048, + "learning_rate": 2.53016894609815e-06, + "loss": 0.9438, + "step": 767 + }, + { + "epoch": 1.1114327062228655, + "grad_norm": 1.5658576488494873, + "learning_rate": 2.52614641995173e-06, + "loss": 0.9568, + "step": 768 + }, + { + "epoch": 1.1128798842257597, + "grad_norm": 1.3818690776824951, + "learning_rate": 2.52212389380531e-06, + "loss": 0.9381, + "step": 769 + }, + { + "epoch": 1.114327062228654, + "grad_norm": 1.618514895439148, + "learning_rate": 2.51810136765889e-06, + "loss": 1.007, + "step": 770 + }, + { + "epoch": 1.114327062228654, + "eval_loss": 1.0651800632476807, + "eval_runtime": 23.7764, + "eval_samples_per_second": 42.059, + "eval_steps_per_second": 2.65, + "step": 770 + }, + { + "epoch": 1.1157742402315485, + "grad_norm": 1.444734811782837, + "learning_rate": 2.51407884151247e-06, + "loss": 0.9541, + "step": 771 + }, + { + "epoch": 1.1172214182344429, + "grad_norm": 1.4572707414627075, + "learning_rate": 2.5100563153660503e-06, + "loss": 0.9427, + "step": 772 + }, + { + "epoch": 1.1186685962373373, + "grad_norm": 1.398985505104065, + "learning_rate": 2.50603378921963e-06, + "loss": 0.9922, + "step": 773 + }, + { + "epoch": 1.1201157742402315, + "grad_norm": 1.5025098323822021, + "learning_rate": 2.50201126307321e-06, + "loss": 0.9771, + "step": 774 + }, + { + "epoch": 1.1215629522431259, + "grad_norm": 1.470844030380249, + "learning_rate": 2.4979887369267903e-06, + "loss": 0.9426, + "step": 775 + }, + { + "epoch": 1.1230101302460203, + "grad_norm": 1.4982131719589233, + "learning_rate": 2.4939662107803704e-06, + "loss": 0.955, + "step": 776 + }, + { + "epoch": 1.1244573082489147, + "grad_norm": 1.4615315198898315, + "learning_rate": 2.48994368463395e-06, + "loss": 0.9522, + "step": 777 + }, + { + "epoch": 1.125904486251809, + "grad_norm": 1.4752854108810425, + "learning_rate": 2.4859211584875302e-06, + "loss": 0.9355, + "step": 778 + }, + { + "epoch": 1.1273516642547032, + "grad_norm": 1.5418580770492554, + "learning_rate": 2.4818986323411104e-06, + "loss": 0.9731, + "step": 779 + }, + { + "epoch": 1.1287988422575976, + "grad_norm": 1.4596821069717407, + "learning_rate": 2.4778761061946905e-06, + "loss": 0.9537, + "step": 780 + }, + { + "epoch": 1.1287988422575976, + "eval_loss": 1.0653374195098877, + "eval_runtime": 23.7798, + "eval_samples_per_second": 42.052, + "eval_steps_per_second": 2.649, + "step": 780 + }, + { + "epoch": 1.130246020260492, + "grad_norm": 1.499121904373169, + "learning_rate": 2.4738535800482706e-06, + "loss": 0.9301, + "step": 781 + }, + { + "epoch": 1.1316931982633864, + "grad_norm": 1.4389591217041016, + "learning_rate": 2.4698310539018503e-06, + "loss": 0.9612, + "step": 782 + }, + { + "epoch": 1.1331403762662808, + "grad_norm": 1.4259767532348633, + "learning_rate": 2.4658085277554304e-06, + "loss": 0.9637, + "step": 783 + }, + { + "epoch": 1.134587554269175, + "grad_norm": 1.4260019063949585, + "learning_rate": 2.4617860016090106e-06, + "loss": 0.9312, + "step": 784 + }, + { + "epoch": 1.1360347322720694, + "grad_norm": 1.4691858291625977, + "learning_rate": 2.4577634754625907e-06, + "loss": 0.9739, + "step": 785 + }, + { + "epoch": 1.1374819102749638, + "grad_norm": 1.45038640499115, + "learning_rate": 2.453740949316171e-06, + "loss": 0.937, + "step": 786 + }, + { + "epoch": 1.1389290882778582, + "grad_norm": 1.4629253149032593, + "learning_rate": 2.4497184231697505e-06, + "loss": 0.9739, + "step": 787 + }, + { + "epoch": 1.1403762662807526, + "grad_norm": 1.5467466115951538, + "learning_rate": 2.4456958970233307e-06, + "loss": 0.9433, + "step": 788 + }, + { + "epoch": 1.1418234442836468, + "grad_norm": 1.4949489831924438, + "learning_rate": 2.4416733708769108e-06, + "loss": 0.9434, + "step": 789 + }, + { + "epoch": 1.1432706222865412, + "grad_norm": 1.488418459892273, + "learning_rate": 2.437650844730491e-06, + "loss": 0.957, + "step": 790 + }, + { + "epoch": 1.1432706222865412, + "eval_loss": 1.062988519668579, + "eval_runtime": 23.8174, + "eval_samples_per_second": 41.986, + "eval_steps_per_second": 2.645, + "step": 790 + }, + { + "epoch": 1.1447178002894356, + "grad_norm": 1.4383903741836548, + "learning_rate": 2.433628318584071e-06, + "loss": 0.9554, + "step": 791 + }, + { + "epoch": 1.14616497829233, + "grad_norm": 1.5889939069747925, + "learning_rate": 2.4296057924376507e-06, + "loss": 0.9473, + "step": 792 + }, + { + "epoch": 1.1476121562952244, + "grad_norm": 1.4130676984786987, + "learning_rate": 2.425583266291231e-06, + "loss": 0.9464, + "step": 793 + }, + { + "epoch": 1.1490593342981186, + "grad_norm": 1.5219330787658691, + "learning_rate": 2.421560740144811e-06, + "loss": 1.0053, + "step": 794 + }, + { + "epoch": 1.150506512301013, + "grad_norm": 1.4843977689743042, + "learning_rate": 2.417538213998391e-06, + "loss": 0.9267, + "step": 795 + }, + { + "epoch": 1.1519536903039074, + "grad_norm": 1.4772837162017822, + "learning_rate": 2.4135156878519713e-06, + "loss": 0.9539, + "step": 796 + }, + { + "epoch": 1.1534008683068018, + "grad_norm": 1.393520474433899, + "learning_rate": 2.4094931617055514e-06, + "loss": 0.927, + "step": 797 + }, + { + "epoch": 1.1548480463096962, + "grad_norm": 1.4185994863510132, + "learning_rate": 2.4054706355591315e-06, + "loss": 0.9025, + "step": 798 + }, + { + "epoch": 1.1562952243125904, + "grad_norm": 1.4406406879425049, + "learning_rate": 2.4014481094127116e-06, + "loss": 0.9444, + "step": 799 + }, + { + "epoch": 1.1577424023154848, + "grad_norm": 1.5312485694885254, + "learning_rate": 2.3974255832662913e-06, + "loss": 0.9508, + "step": 800 + }, + { + "epoch": 1.1577424023154848, + "eval_loss": 1.0626981258392334, + "eval_runtime": 23.8234, + "eval_samples_per_second": 41.976, + "eval_steps_per_second": 2.644, + "step": 800 + }, + { + "epoch": 1.1591895803183792, + "grad_norm": 1.6089097261428833, + "learning_rate": 2.3934030571198715e-06, + "loss": 1.0038, + "step": 801 + }, + { + "epoch": 1.1606367583212736, + "grad_norm": 1.5880542993545532, + "learning_rate": 2.3893805309734516e-06, + "loss": 0.9189, + "step": 802 + }, + { + "epoch": 1.162083936324168, + "grad_norm": 1.5894044637680054, + "learning_rate": 2.3853580048270317e-06, + "loss": 0.981, + "step": 803 + }, + { + "epoch": 1.1635311143270621, + "grad_norm": 1.4618388414382935, + "learning_rate": 2.381335478680612e-06, + "loss": 0.9561, + "step": 804 + }, + { + "epoch": 1.1649782923299565, + "grad_norm": 1.4470610618591309, + "learning_rate": 2.3773129525341916e-06, + "loss": 0.9428, + "step": 805 + }, + { + "epoch": 1.166425470332851, + "grad_norm": 1.443575382232666, + "learning_rate": 2.3732904263877717e-06, + "loss": 0.945, + "step": 806 + }, + { + "epoch": 1.1678726483357453, + "grad_norm": 1.524228572845459, + "learning_rate": 2.369267900241352e-06, + "loss": 0.9828, + "step": 807 + }, + { + "epoch": 1.1693198263386397, + "grad_norm": 1.570440649986267, + "learning_rate": 2.365245374094932e-06, + "loss": 0.9349, + "step": 808 + }, + { + "epoch": 1.170767004341534, + "grad_norm": 1.454323172569275, + "learning_rate": 2.361222847948512e-06, + "loss": 0.9293, + "step": 809 + }, + { + "epoch": 1.1722141823444283, + "grad_norm": 1.5222820043563843, + "learning_rate": 2.3572003218020918e-06, + "loss": 0.966, + "step": 810 + }, + { + "epoch": 1.1722141823444283, + "eval_loss": 1.0610514879226685, + "eval_runtime": 23.7519, + "eval_samples_per_second": 42.102, + "eval_steps_per_second": 2.652, + "step": 810 + }, + { + "epoch": 1.1736613603473227, + "grad_norm": 1.5027915239334106, + "learning_rate": 2.353177795655672e-06, + "loss": 0.9433, + "step": 811 + }, + { + "epoch": 1.1751085383502171, + "grad_norm": 1.50215744972229, + "learning_rate": 2.349155269509252e-06, + "loss": 0.9462, + "step": 812 + }, + { + "epoch": 1.1765557163531115, + "grad_norm": 1.5002515316009521, + "learning_rate": 2.345132743362832e-06, + "loss": 0.9633, + "step": 813 + }, + { + "epoch": 1.1780028943560057, + "grad_norm": 1.5375967025756836, + "learning_rate": 2.3411102172164123e-06, + "loss": 0.9701, + "step": 814 + }, + { + "epoch": 1.1794500723589, + "grad_norm": 1.504026174545288, + "learning_rate": 2.337087691069992e-06, + "loss": 0.9672, + "step": 815 + }, + { + "epoch": 1.1808972503617945, + "grad_norm": 1.4950342178344727, + "learning_rate": 2.333065164923572e-06, + "loss": 0.9632, + "step": 816 + }, + { + "epoch": 1.182344428364689, + "grad_norm": 1.4483928680419922, + "learning_rate": 2.3290426387771522e-06, + "loss": 0.9595, + "step": 817 + }, + { + "epoch": 1.1837916063675833, + "grad_norm": 1.3476918935775757, + "learning_rate": 2.3250201126307324e-06, + "loss": 0.9451, + "step": 818 + }, + { + "epoch": 1.1852387843704775, + "grad_norm": 1.449734091758728, + "learning_rate": 2.3209975864843125e-06, + "loss": 0.9692, + "step": 819 + }, + { + "epoch": 1.1866859623733719, + "grad_norm": 1.4646224975585938, + "learning_rate": 2.316975060337892e-06, + "loss": 0.9181, + "step": 820 + }, + { + "epoch": 1.1866859623733719, + "eval_loss": 1.0595619678497314, + "eval_runtime": 23.7668, + "eval_samples_per_second": 42.075, + "eval_steps_per_second": 2.651, + "step": 820 + }, + { + "epoch": 1.1881331403762663, + "grad_norm": 1.3570210933685303, + "learning_rate": 2.3129525341914723e-06, + "loss": 0.9664, + "step": 821 + }, + { + "epoch": 1.1895803183791607, + "grad_norm": 1.5159149169921875, + "learning_rate": 2.3089300080450525e-06, + "loss": 0.9694, + "step": 822 + }, + { + "epoch": 1.191027496382055, + "grad_norm": 1.529811978340149, + "learning_rate": 2.3049074818986326e-06, + "loss": 0.9568, + "step": 823 + }, + { + "epoch": 1.1924746743849495, + "grad_norm": 1.4050387144088745, + "learning_rate": 2.3008849557522127e-06, + "loss": 0.9398, + "step": 824 + }, + { + "epoch": 1.1939218523878437, + "grad_norm": 1.459504246711731, + "learning_rate": 2.2968624296057924e-06, + "loss": 0.976, + "step": 825 + }, + { + "epoch": 1.195369030390738, + "grad_norm": 1.413389801979065, + "learning_rate": 2.2928399034593725e-06, + "loss": 0.961, + "step": 826 + }, + { + "epoch": 1.1968162083936325, + "grad_norm": 1.4313169717788696, + "learning_rate": 2.2888173773129527e-06, + "loss": 0.9758, + "step": 827 + }, + { + "epoch": 1.1982633863965269, + "grad_norm": 1.4943634271621704, + "learning_rate": 2.284794851166533e-06, + "loss": 0.9456, + "step": 828 + }, + { + "epoch": 1.199710564399421, + "grad_norm": 1.5241035223007202, + "learning_rate": 2.280772325020113e-06, + "loss": 0.9814, + "step": 829 + }, + { + "epoch": 1.2011577424023154, + "grad_norm": 1.5724812746047974, + "learning_rate": 2.2767497988736926e-06, + "loss": 0.945, + "step": 830 + }, + { + "epoch": 1.2011577424023154, + "eval_loss": 1.0602580308914185, + "eval_runtime": 23.7721, + "eval_samples_per_second": 42.066, + "eval_steps_per_second": 2.65, + "step": 830 + }, + { + "epoch": 1.2026049204052098, + "grad_norm": 1.5690464973449707, + "learning_rate": 2.2727272727272728e-06, + "loss": 0.9241, + "step": 831 + }, + { + "epoch": 1.2040520984081042, + "grad_norm": 1.4767553806304932, + "learning_rate": 2.268704746580853e-06, + "loss": 0.9488, + "step": 832 + }, + { + "epoch": 1.2054992764109986, + "grad_norm": 1.4332164525985718, + "learning_rate": 2.264682220434433e-06, + "loss": 0.9357, + "step": 833 + }, + { + "epoch": 1.206946454413893, + "grad_norm": 1.523019552230835, + "learning_rate": 2.260659694288013e-06, + "loss": 0.9407, + "step": 834 + }, + { + "epoch": 1.2083936324167872, + "grad_norm": 1.6495591402053833, + "learning_rate": 2.256637168141593e-06, + "loss": 0.9178, + "step": 835 + }, + { + "epoch": 1.2098408104196816, + "grad_norm": 1.47744882106781, + "learning_rate": 2.252614641995173e-06, + "loss": 0.9384, + "step": 836 + }, + { + "epoch": 1.211287988422576, + "grad_norm": 1.3810203075408936, + "learning_rate": 2.248592115848753e-06, + "loss": 0.9457, + "step": 837 + }, + { + "epoch": 1.2127351664254704, + "grad_norm": 1.543784737586975, + "learning_rate": 2.2445695897023332e-06, + "loss": 0.9466, + "step": 838 + }, + { + "epoch": 1.2141823444283646, + "grad_norm": 1.4719823598861694, + "learning_rate": 2.2405470635559134e-06, + "loss": 0.9263, + "step": 839 + }, + { + "epoch": 1.215629522431259, + "grad_norm": 1.5356696844100952, + "learning_rate": 2.236524537409493e-06, + "loss": 0.9633, + "step": 840 + }, + { + "epoch": 1.215629522431259, + "eval_loss": 1.058498740196228, + "eval_runtime": 23.8479, + "eval_samples_per_second": 41.932, + "eval_steps_per_second": 2.642, + "step": 840 + }, + { + "epoch": 1.2170767004341534, + "grad_norm": 1.4253358840942383, + "learning_rate": 2.232502011263073e-06, + "loss": 0.916, + "step": 841 + }, + { + "epoch": 1.2185238784370478, + "grad_norm": 1.4666118621826172, + "learning_rate": 2.2284794851166537e-06, + "loss": 0.9459, + "step": 842 + }, + { + "epoch": 1.2199710564399422, + "grad_norm": 1.5525188446044922, + "learning_rate": 2.2244569589702334e-06, + "loss": 0.9181, + "step": 843 + }, + { + "epoch": 1.2214182344428366, + "grad_norm": 1.4791418313980103, + "learning_rate": 2.2204344328238136e-06, + "loss": 0.9329, + "step": 844 + }, + { + "epoch": 1.2228654124457308, + "grad_norm": 1.4657142162322998, + "learning_rate": 2.2164119066773937e-06, + "loss": 0.9487, + "step": 845 + }, + { + "epoch": 1.2243125904486252, + "grad_norm": 1.5355684757232666, + "learning_rate": 2.212389380530974e-06, + "loss": 0.9626, + "step": 846 + }, + { + "epoch": 1.2257597684515196, + "grad_norm": 1.4917405843734741, + "learning_rate": 2.208366854384554e-06, + "loss": 0.9562, + "step": 847 + }, + { + "epoch": 1.227206946454414, + "grad_norm": 1.4700294733047485, + "learning_rate": 2.2043443282381337e-06, + "loss": 0.9816, + "step": 848 + }, + { + "epoch": 1.2286541244573081, + "grad_norm": 1.473539113998413, + "learning_rate": 2.2003218020917138e-06, + "loss": 0.9466, + "step": 849 + }, + { + "epoch": 1.2301013024602026, + "grad_norm": 1.5261290073394775, + "learning_rate": 2.196299275945294e-06, + "loss": 0.9458, + "step": 850 + }, + { + "epoch": 1.2301013024602026, + "eval_loss": 1.056043267250061, + "eval_runtime": 23.7701, + "eval_samples_per_second": 42.07, + "eval_steps_per_second": 2.65, + "step": 850 + }, + { + "epoch": 1.231548480463097, + "grad_norm": 1.4729208946228027, + "learning_rate": 2.192276749798874e-06, + "loss": 0.9607, + "step": 851 + }, + { + "epoch": 1.2329956584659914, + "grad_norm": 1.4478142261505127, + "learning_rate": 2.188254223652454e-06, + "loss": 0.949, + "step": 852 + }, + { + "epoch": 1.2344428364688858, + "grad_norm": 1.5517524480819702, + "learning_rate": 2.184231697506034e-06, + "loss": 0.9366, + "step": 853 + }, + { + "epoch": 1.2358900144717802, + "grad_norm": 1.509303092956543, + "learning_rate": 2.180209171359614e-06, + "loss": 0.938, + "step": 854 + }, + { + "epoch": 1.2373371924746743, + "grad_norm": 1.5616720914840698, + "learning_rate": 2.176186645213194e-06, + "loss": 0.9547, + "step": 855 + }, + { + "epoch": 1.2387843704775687, + "grad_norm": 1.5296682119369507, + "learning_rate": 2.1721641190667743e-06, + "loss": 0.9166, + "step": 856 + }, + { + "epoch": 1.2402315484804631, + "grad_norm": 1.5027031898498535, + "learning_rate": 2.1681415929203544e-06, + "loss": 0.9468, + "step": 857 + }, + { + "epoch": 1.2416787264833575, + "grad_norm": 1.4932961463928223, + "learning_rate": 2.164119066773934e-06, + "loss": 0.9523, + "step": 858 + }, + { + "epoch": 1.2431259044862517, + "grad_norm": 1.4184107780456543, + "learning_rate": 2.1600965406275142e-06, + "loss": 0.9378, + "step": 859 + }, + { + "epoch": 1.244573082489146, + "grad_norm": 1.4559129476547241, + "learning_rate": 2.1560740144810943e-06, + "loss": 0.9884, + "step": 860 + }, + { + "epoch": 1.244573082489146, + "eval_loss": 1.055425763130188, + "eval_runtime": 23.6687, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 2.662, + "step": 860 + }, + { + "epoch": 1.2460202604920405, + "grad_norm": 1.4457273483276367, + "learning_rate": 2.1520514883346745e-06, + "loss": 0.9293, + "step": 861 + }, + { + "epoch": 1.247467438494935, + "grad_norm": 1.3795247077941895, + "learning_rate": 2.1480289621882546e-06, + "loss": 0.945, + "step": 862 + }, + { + "epoch": 1.2489146164978293, + "grad_norm": 1.411369800567627, + "learning_rate": 2.1440064360418343e-06, + "loss": 0.9304, + "step": 863 + }, + { + "epoch": 1.2503617945007237, + "grad_norm": 1.4530861377716064, + "learning_rate": 2.1399839098954144e-06, + "loss": 0.9126, + "step": 864 + }, + { + "epoch": 1.251808972503618, + "grad_norm": 1.4359352588653564, + "learning_rate": 2.1359613837489946e-06, + "loss": 0.9436, + "step": 865 + }, + { + "epoch": 1.2532561505065123, + "grad_norm": 1.451204538345337, + "learning_rate": 2.1319388576025747e-06, + "loss": 0.9589, + "step": 866 + }, + { + "epoch": 1.2547033285094067, + "grad_norm": 1.5190671682357788, + "learning_rate": 2.127916331456155e-06, + "loss": 0.9587, + "step": 867 + }, + { + "epoch": 1.256150506512301, + "grad_norm": 1.4806249141693115, + "learning_rate": 2.1238938053097345e-06, + "loss": 0.9349, + "step": 868 + }, + { + "epoch": 1.2575976845151953, + "grad_norm": 1.4910272359848022, + "learning_rate": 2.1198712791633146e-06, + "loss": 0.9289, + "step": 869 + }, + { + "epoch": 1.2590448625180897, + "grad_norm": 1.4659475088119507, + "learning_rate": 2.1158487530168948e-06, + "loss": 0.9597, + "step": 870 + }, + { + "epoch": 1.2590448625180897, + "eval_loss": 1.0561730861663818, + "eval_runtime": 23.7128, + "eval_samples_per_second": 42.171, + "eval_steps_per_second": 2.657, + "step": 870 + }, + { + "epoch": 1.260492040520984, + "grad_norm": 1.4795215129852295, + "learning_rate": 2.111826226870475e-06, + "loss": 0.9451, + "step": 871 + }, + { + "epoch": 1.2619392185238785, + "grad_norm": 1.4478733539581299, + "learning_rate": 2.107803700724055e-06, + "loss": 0.943, + "step": 872 + }, + { + "epoch": 1.2633863965267729, + "grad_norm": 1.4435054063796997, + "learning_rate": 2.1037811745776347e-06, + "loss": 0.8988, + "step": 873 + }, + { + "epoch": 1.2648335745296673, + "grad_norm": 1.373901128768921, + "learning_rate": 2.099758648431215e-06, + "loss": 0.9442, + "step": 874 + }, + { + "epoch": 1.2662807525325614, + "grad_norm": 1.5020791292190552, + "learning_rate": 2.095736122284795e-06, + "loss": 0.9436, + "step": 875 + }, + { + "epoch": 1.2677279305354558, + "grad_norm": 1.4131574630737305, + "learning_rate": 2.091713596138375e-06, + "loss": 0.8942, + "step": 876 + }, + { + "epoch": 1.2691751085383502, + "grad_norm": 1.4769846200942993, + "learning_rate": 2.0876910699919552e-06, + "loss": 0.9404, + "step": 877 + }, + { + "epoch": 1.2706222865412446, + "grad_norm": 1.5109491348266602, + "learning_rate": 2.083668543845535e-06, + "loss": 0.953, + "step": 878 + }, + { + "epoch": 1.2720694645441388, + "grad_norm": 1.5477077960968018, + "learning_rate": 2.079646017699115e-06, + "loss": 0.9512, + "step": 879 + }, + { + "epoch": 1.2735166425470332, + "grad_norm": 1.5046113729476929, + "learning_rate": 2.075623491552695e-06, + "loss": 0.9485, + "step": 880 + }, + { + "epoch": 1.2735166425470332, + "eval_loss": 1.0562736988067627, + "eval_runtime": 23.792, + "eval_samples_per_second": 42.031, + "eval_steps_per_second": 2.648, + "step": 880 + }, + { + "epoch": 1.2749638205499276, + "grad_norm": 1.4490458965301514, + "learning_rate": 2.0716009654062753e-06, + "loss": 0.951, + "step": 881 + }, + { + "epoch": 1.276410998552822, + "grad_norm": 1.4025237560272217, + "learning_rate": 2.067578439259855e-06, + "loss": 0.959, + "step": 882 + }, + { + "epoch": 1.2778581765557164, + "grad_norm": 1.472821831703186, + "learning_rate": 2.063555913113435e-06, + "loss": 0.9217, + "step": 883 + }, + { + "epoch": 1.2793053545586108, + "grad_norm": 1.4784339666366577, + "learning_rate": 2.0595333869670153e-06, + "loss": 0.9436, + "step": 884 + }, + { + "epoch": 1.280752532561505, + "grad_norm": 1.424924612045288, + "learning_rate": 2.0555108608205954e-06, + "loss": 0.9507, + "step": 885 + }, + { + "epoch": 1.2821997105643994, + "grad_norm": 1.4141677618026733, + "learning_rate": 2.0514883346741755e-06, + "loss": 0.9368, + "step": 886 + }, + { + "epoch": 1.2836468885672938, + "grad_norm": 1.4544395208358765, + "learning_rate": 2.0474658085277557e-06, + "loss": 0.9389, + "step": 887 + }, + { + "epoch": 1.2850940665701882, + "grad_norm": 1.4714221954345703, + "learning_rate": 2.043443282381336e-06, + "loss": 0.9329, + "step": 888 + }, + { + "epoch": 1.2865412445730824, + "grad_norm": 1.535444736480713, + "learning_rate": 2.039420756234916e-06, + "loss": 0.956, + "step": 889 + }, + { + "epoch": 1.2879884225759768, + "grad_norm": 1.502983808517456, + "learning_rate": 2.035398230088496e-06, + "loss": 0.9405, + "step": 890 + }, + { + "epoch": 1.2879884225759768, + "eval_loss": 1.0558134317398071, + "eval_runtime": 23.7998, + "eval_samples_per_second": 42.017, + "eval_steps_per_second": 2.647, + "step": 890 + }, + { + "epoch": 1.2894356005788712, + "grad_norm": 1.4328001737594604, + "learning_rate": 2.0313757039420758e-06, + "loss": 0.9046, + "step": 891 + }, + { + "epoch": 1.2908827785817656, + "grad_norm": 1.5354102849960327, + "learning_rate": 2.027353177795656e-06, + "loss": 0.9437, + "step": 892 + }, + { + "epoch": 1.29232995658466, + "grad_norm": 1.5058958530426025, + "learning_rate": 2.023330651649236e-06, + "loss": 0.9563, + "step": 893 + }, + { + "epoch": 1.2937771345875544, + "grad_norm": 1.424705982208252, + "learning_rate": 2.019308125502816e-06, + "loss": 0.9489, + "step": 894 + }, + { + "epoch": 1.2952243125904486, + "grad_norm": 1.4443714618682861, + "learning_rate": 2.0152855993563963e-06, + "loss": 0.9364, + "step": 895 + }, + { + "epoch": 1.296671490593343, + "grad_norm": 1.546347737312317, + "learning_rate": 2.011263073209976e-06, + "loss": 0.9161, + "step": 896 + }, + { + "epoch": 1.2981186685962374, + "grad_norm": 1.4638381004333496, + "learning_rate": 2.007240547063556e-06, + "loss": 0.9413, + "step": 897 + }, + { + "epoch": 1.2995658465991318, + "grad_norm": 1.4480944871902466, + "learning_rate": 2.0032180209171362e-06, + "loss": 0.9487, + "step": 898 + }, + { + "epoch": 1.301013024602026, + "grad_norm": 1.5396231412887573, + "learning_rate": 1.9991954947707164e-06, + "loss": 0.9489, + "step": 899 + }, + { + "epoch": 1.3024602026049203, + "grad_norm": 1.4865225553512573, + "learning_rate": 1.995172968624296e-06, + "loss": 0.9403, + "step": 900 + }, + { + "epoch": 1.3024602026049203, + "eval_loss": 1.0528783798217773, + "eval_runtime": 23.783, + "eval_samples_per_second": 42.047, + "eval_steps_per_second": 2.649, + "step": 900 + }, + { + "epoch": 1.3039073806078147, + "grad_norm": 1.4845038652420044, + "learning_rate": 1.991150442477876e-06, + "loss": 0.9605, + "step": 901 + }, + { + "epoch": 1.3053545586107091, + "grad_norm": 1.4714915752410889, + "learning_rate": 1.9871279163314563e-06, + "loss": 0.9475, + "step": 902 + }, + { + "epoch": 1.3068017366136035, + "grad_norm": 1.5091931819915771, + "learning_rate": 1.9831053901850364e-06, + "loss": 0.9251, + "step": 903 + }, + { + "epoch": 1.308248914616498, + "grad_norm": 1.4843708276748657, + "learning_rate": 1.9790828640386166e-06, + "loss": 0.9297, + "step": 904 + }, + { + "epoch": 1.3096960926193921, + "grad_norm": 1.531639814376831, + "learning_rate": 1.9750603378921963e-06, + "loss": 0.9712, + "step": 905 + }, + { + "epoch": 1.3111432706222865, + "grad_norm": 1.5620850324630737, + "learning_rate": 1.9710378117457764e-06, + "loss": 0.9675, + "step": 906 + }, + { + "epoch": 1.312590448625181, + "grad_norm": 1.4942399263381958, + "learning_rate": 1.9670152855993565e-06, + "loss": 0.9548, + "step": 907 + }, + { + "epoch": 1.3140376266280753, + "grad_norm": 1.552949070930481, + "learning_rate": 1.9629927594529367e-06, + "loss": 0.9452, + "step": 908 + }, + { + "epoch": 1.3154848046309695, + "grad_norm": 1.401281476020813, + "learning_rate": 1.9589702333065168e-06, + "loss": 0.9347, + "step": 909 + }, + { + "epoch": 1.316931982633864, + "grad_norm": 1.4023936986923218, + "learning_rate": 1.9549477071600965e-06, + "loss": 0.9041, + "step": 910 + }, + { + "epoch": 1.316931982633864, + "eval_loss": 1.051599144935608, + "eval_runtime": 23.7155, + "eval_samples_per_second": 42.167, + "eval_steps_per_second": 2.656, + "step": 910 + }, + { + "epoch": 1.3183791606367583, + "grad_norm": 1.5213221311569214, + "learning_rate": 1.9509251810136766e-06, + "loss": 0.967, + "step": 911 + }, + { + "epoch": 1.3198263386396527, + "grad_norm": 1.5014933347702026, + "learning_rate": 1.9469026548672567e-06, + "loss": 0.9228, + "step": 912 + }, + { + "epoch": 1.321273516642547, + "grad_norm": 1.4590164422988892, + "learning_rate": 1.942880128720837e-06, + "loss": 0.9441, + "step": 913 + }, + { + "epoch": 1.3227206946454415, + "grad_norm": 1.4393606185913086, + "learning_rate": 1.938857602574417e-06, + "loss": 0.9725, + "step": 914 + }, + { + "epoch": 1.3241678726483357, + "grad_norm": 1.4212093353271484, + "learning_rate": 1.9348350764279967e-06, + "loss": 0.9573, + "step": 915 + }, + { + "epoch": 1.32561505065123, + "grad_norm": 1.5041481256484985, + "learning_rate": 1.930812550281577e-06, + "loss": 0.9537, + "step": 916 + }, + { + "epoch": 1.3270622286541245, + "grad_norm": 1.4008187055587769, + "learning_rate": 1.926790024135157e-06, + "loss": 0.9266, + "step": 917 + }, + { + "epoch": 1.3285094066570189, + "grad_norm": 1.4834818840026855, + "learning_rate": 1.922767497988737e-06, + "loss": 0.9199, + "step": 918 + }, + { + "epoch": 1.329956584659913, + "grad_norm": 1.508237600326538, + "learning_rate": 1.9187449718423172e-06, + "loss": 0.9573, + "step": 919 + }, + { + "epoch": 1.3314037626628075, + "grad_norm": 1.4868502616882324, + "learning_rate": 1.914722445695897e-06, + "loss": 0.9654, + "step": 920 + }, + { + "epoch": 1.3314037626628075, + "eval_loss": 1.0539422035217285, + "eval_runtime": 23.758, + "eval_samples_per_second": 42.091, + "eval_steps_per_second": 2.652, + "step": 920 + }, + { + "epoch": 1.3328509406657019, + "grad_norm": 1.4414838552474976, + "learning_rate": 1.910699919549477e-06, + "loss": 0.9713, + "step": 921 + }, + { + "epoch": 1.3342981186685963, + "grad_norm": 1.499587059020996, + "learning_rate": 1.9066773934030572e-06, + "loss": 0.959, + "step": 922 + }, + { + "epoch": 1.3357452966714907, + "grad_norm": 1.5229673385620117, + "learning_rate": 1.9026548672566373e-06, + "loss": 0.9492, + "step": 923 + }, + { + "epoch": 1.337192474674385, + "grad_norm": 1.5483434200286865, + "learning_rate": 1.8986323411102172e-06, + "loss": 0.9346, + "step": 924 + }, + { + "epoch": 1.3386396526772792, + "grad_norm": 1.4725099802017212, + "learning_rate": 1.8946098149637973e-06, + "loss": 0.9296, + "step": 925 + }, + { + "epoch": 1.3400868306801736, + "grad_norm": 1.5383471250534058, + "learning_rate": 1.8905872888173773e-06, + "loss": 0.9317, + "step": 926 + }, + { + "epoch": 1.341534008683068, + "grad_norm": 1.4471979141235352, + "learning_rate": 1.8865647626709574e-06, + "loss": 0.9326, + "step": 927 + }, + { + "epoch": 1.3429811866859624, + "grad_norm": 1.4730262756347656, + "learning_rate": 1.8825422365245375e-06, + "loss": 0.9149, + "step": 928 + }, + { + "epoch": 1.3444283646888566, + "grad_norm": 1.4816479682922363, + "learning_rate": 1.8785197103781174e-06, + "loss": 0.9234, + "step": 929 + }, + { + "epoch": 1.345875542691751, + "grad_norm": 1.5070077180862427, + "learning_rate": 1.8744971842316978e-06, + "loss": 0.9367, + "step": 930 + }, + { + "epoch": 1.345875542691751, + "eval_loss": 1.0537391901016235, + "eval_runtime": 23.8135, + "eval_samples_per_second": 41.993, + "eval_steps_per_second": 2.646, + "step": 930 + }, + { + "epoch": 1.3473227206946454, + "grad_norm": 1.4584532976150513, + "learning_rate": 1.870474658085278e-06, + "loss": 0.9253, + "step": 931 + }, + { + "epoch": 1.3487698986975398, + "grad_norm": 1.5026274919509888, + "learning_rate": 1.8664521319388578e-06, + "loss": 0.9376, + "step": 932 + }, + { + "epoch": 1.3502170767004342, + "grad_norm": 1.5126888751983643, + "learning_rate": 1.862429605792438e-06, + "loss": 0.9201, + "step": 933 + }, + { + "epoch": 1.3516642547033286, + "grad_norm": 1.4758694171905518, + "learning_rate": 1.8584070796460179e-06, + "loss": 0.929, + "step": 934 + }, + { + "epoch": 1.3531114327062228, + "grad_norm": 1.6372520923614502, + "learning_rate": 1.854384553499598e-06, + "loss": 0.97, + "step": 935 + }, + { + "epoch": 1.3545586107091172, + "grad_norm": 1.4822540283203125, + "learning_rate": 1.8503620273531781e-06, + "loss": 0.96, + "step": 936 + }, + { + "epoch": 1.3560057887120116, + "grad_norm": 1.4904818534851074, + "learning_rate": 1.846339501206758e-06, + "loss": 0.9421, + "step": 937 + }, + { + "epoch": 1.357452966714906, + "grad_norm": 1.457805871963501, + "learning_rate": 1.8423169750603382e-06, + "loss": 0.9124, + "step": 938 + }, + { + "epoch": 1.3589001447178002, + "grad_norm": 1.464258074760437, + "learning_rate": 1.838294448913918e-06, + "loss": 0.9328, + "step": 939 + }, + { + "epoch": 1.3603473227206946, + "grad_norm": 1.462436556816101, + "learning_rate": 1.8342719227674982e-06, + "loss": 0.9319, + "step": 940 + }, + { + "epoch": 1.3603473227206946, + "eval_loss": 1.0547114610671997, + "eval_runtime": 23.7821, + "eval_samples_per_second": 42.048, + "eval_steps_per_second": 2.649, + "step": 940 + }, + { + "epoch": 1.361794500723589, + "grad_norm": 1.472638487815857, + "learning_rate": 1.8302493966210783e-06, + "loss": 0.9468, + "step": 941 + }, + { + "epoch": 1.3632416787264834, + "grad_norm": 1.422621726989746, + "learning_rate": 1.8262268704746582e-06, + "loss": 0.9273, + "step": 942 + }, + { + "epoch": 1.3646888567293778, + "grad_norm": 1.4565759897232056, + "learning_rate": 1.8222043443282384e-06, + "loss": 0.9355, + "step": 943 + }, + { + "epoch": 1.3661360347322722, + "grad_norm": 1.4601001739501953, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.9348, + "step": 944 + }, + { + "epoch": 1.3675832127351664, + "grad_norm": 1.4513534307479858, + "learning_rate": 1.8141592920353984e-06, + "loss": 0.9244, + "step": 945 + }, + { + "epoch": 1.3690303907380608, + "grad_norm": 1.4226782321929932, + "learning_rate": 1.8101367658889785e-06, + "loss": 0.9425, + "step": 946 + }, + { + "epoch": 1.3704775687409552, + "grad_norm": 1.4283246994018555, + "learning_rate": 1.8061142397425585e-06, + "loss": 0.9507, + "step": 947 + }, + { + "epoch": 1.3719247467438496, + "grad_norm": 1.4652060270309448, + "learning_rate": 1.8020917135961386e-06, + "loss": 0.9685, + "step": 948 + }, + { + "epoch": 1.3733719247467437, + "grad_norm": 1.455932378768921, + "learning_rate": 1.7980691874497185e-06, + "loss": 0.9415, + "step": 949 + }, + { + "epoch": 1.3748191027496381, + "grad_norm": 1.4629865884780884, + "learning_rate": 1.7940466613032986e-06, + "loss": 0.8957, + "step": 950 + }, + { + "epoch": 1.3748191027496381, + "eval_loss": 1.0515986680984497, + "eval_runtime": 23.7139, + "eval_samples_per_second": 42.169, + "eval_steps_per_second": 2.657, + "step": 950 + }, + { + "epoch": 1.3762662807525325, + "grad_norm": 1.5208227634429932, + "learning_rate": 1.7900241351568788e-06, + "loss": 0.8962, + "step": 951 + }, + { + "epoch": 1.377713458755427, + "grad_norm": 1.4352270364761353, + "learning_rate": 1.7860016090104587e-06, + "loss": 0.9077, + "step": 952 + }, + { + "epoch": 1.3791606367583213, + "grad_norm": 1.4977892637252808, + "learning_rate": 1.7819790828640388e-06, + "loss": 0.9479, + "step": 953 + }, + { + "epoch": 1.3806078147612157, + "grad_norm": 1.3771772384643555, + "learning_rate": 1.7779565567176187e-06, + "loss": 0.9073, + "step": 954 + }, + { + "epoch": 1.38205499276411, + "grad_norm": 1.4708818197250366, + "learning_rate": 1.7739340305711988e-06, + "loss": 0.9124, + "step": 955 + }, + { + "epoch": 1.3835021707670043, + "grad_norm": 1.442540168762207, + "learning_rate": 1.769911504424779e-06, + "loss": 0.9279, + "step": 956 + }, + { + "epoch": 1.3849493487698987, + "grad_norm": 1.4638023376464844, + "learning_rate": 1.7658889782783589e-06, + "loss": 0.9112, + "step": 957 + }, + { + "epoch": 1.3863965267727931, + "grad_norm": 1.4360297918319702, + "learning_rate": 1.761866452131939e-06, + "loss": 0.9411, + "step": 958 + }, + { + "epoch": 1.3878437047756873, + "grad_norm": 1.4222283363342285, + "learning_rate": 1.757843925985519e-06, + "loss": 0.9293, + "step": 959 + }, + { + "epoch": 1.3892908827785817, + "grad_norm": 1.4751322269439697, + "learning_rate": 1.753821399839099e-06, + "loss": 0.9618, + "step": 960 + }, + { + "epoch": 1.3892908827785817, + "eval_loss": 1.0494076013565063, + "eval_runtime": 23.7202, + "eval_samples_per_second": 42.158, + "eval_steps_per_second": 2.656, + "step": 960 + }, + { + "epoch": 1.390738060781476, + "grad_norm": 1.474975824356079, + "learning_rate": 1.7497988736926792e-06, + "loss": 0.9625, + "step": 961 + }, + { + "epoch": 1.3921852387843705, + "grad_norm": 1.4462794065475464, + "learning_rate": 1.7457763475462591e-06, + "loss": 0.926, + "step": 962 + }, + { + "epoch": 1.393632416787265, + "grad_norm": 1.5485862493515015, + "learning_rate": 1.7417538213998392e-06, + "loss": 0.9514, + "step": 963 + }, + { + "epoch": 1.3950795947901593, + "grad_norm": 1.4370174407958984, + "learning_rate": 1.7377312952534192e-06, + "loss": 0.9686, + "step": 964 + }, + { + "epoch": 1.3965267727930535, + "grad_norm": 1.5217152833938599, + "learning_rate": 1.7337087691069993e-06, + "loss": 0.946, + "step": 965 + }, + { + "epoch": 1.3979739507959479, + "grad_norm": 1.5040043592453003, + "learning_rate": 1.7296862429605792e-06, + "loss": 0.9476, + "step": 966 + }, + { + "epoch": 1.3994211287988423, + "grad_norm": 1.5391035079956055, + "learning_rate": 1.7256637168141593e-06, + "loss": 0.9328, + "step": 967 + }, + { + "epoch": 1.4008683068017367, + "grad_norm": 1.437731146812439, + "learning_rate": 1.7216411906677395e-06, + "loss": 0.8839, + "step": 968 + }, + { + "epoch": 1.4023154848046309, + "grad_norm": 1.4600027799606323, + "learning_rate": 1.7176186645213194e-06, + "loss": 0.9171, + "step": 969 + }, + { + "epoch": 1.4037626628075253, + "grad_norm": 1.5679881572723389, + "learning_rate": 1.7135961383748995e-06, + "loss": 0.964, + "step": 970 + }, + { + "epoch": 1.4037626628075253, + "eval_loss": 1.0472441911697388, + "eval_runtime": 23.7366, + "eval_samples_per_second": 42.129, + "eval_steps_per_second": 2.654, + "step": 970 + }, + { + "epoch": 1.4052098408104197, + "grad_norm": 1.4671127796173096, + "learning_rate": 1.7095736122284794e-06, + "loss": 0.9307, + "step": 971 + }, + { + "epoch": 1.406657018813314, + "grad_norm": 1.3909881114959717, + "learning_rate": 1.7055510860820595e-06, + "loss": 0.9006, + "step": 972 + }, + { + "epoch": 1.4081041968162085, + "grad_norm": 1.4965221881866455, + "learning_rate": 1.7015285599356397e-06, + "loss": 0.9395, + "step": 973 + }, + { + "epoch": 1.4095513748191029, + "grad_norm": 1.5574603080749512, + "learning_rate": 1.6975060337892196e-06, + "loss": 0.9815, + "step": 974 + }, + { + "epoch": 1.410998552821997, + "grad_norm": 1.4614620208740234, + "learning_rate": 1.6934835076428e-06, + "loss": 0.9261, + "step": 975 + }, + { + "epoch": 1.4124457308248914, + "grad_norm": 1.5711225271224976, + "learning_rate": 1.68946098149638e-06, + "loss": 0.9594, + "step": 976 + }, + { + "epoch": 1.4138929088277858, + "grad_norm": 1.4597771167755127, + "learning_rate": 1.68543845534996e-06, + "loss": 0.9442, + "step": 977 + }, + { + "epoch": 1.4153400868306802, + "grad_norm": 1.4819037914276123, + "learning_rate": 1.68141592920354e-06, + "loss": 0.9521, + "step": 978 + }, + { + "epoch": 1.4167872648335744, + "grad_norm": 1.4771568775177002, + "learning_rate": 1.6773934030571202e-06, + "loss": 0.9236, + "step": 979 + }, + { + "epoch": 1.4182344428364688, + "grad_norm": 1.4692391157150269, + "learning_rate": 1.6733708769107001e-06, + "loss": 0.9711, + "step": 980 + }, + { + "epoch": 1.4182344428364688, + "eval_loss": 1.0456666946411133, + "eval_runtime": 23.7426, + "eval_samples_per_second": 42.118, + "eval_steps_per_second": 2.653, + "step": 980 + }, + { + "epoch": 1.4196816208393632, + "grad_norm": 1.5555534362792969, + "learning_rate": 1.6693483507642803e-06, + "loss": 0.9022, + "step": 981 + }, + { + "epoch": 1.4211287988422576, + "grad_norm": 1.4623538255691528, + "learning_rate": 1.6653258246178602e-06, + "loss": 0.9193, + "step": 982 + }, + { + "epoch": 1.422575976845152, + "grad_norm": 1.4388707876205444, + "learning_rate": 1.6613032984714403e-06, + "loss": 0.9085, + "step": 983 + }, + { + "epoch": 1.4240231548480464, + "grad_norm": 1.6197476387023926, + "learning_rate": 1.6572807723250202e-06, + "loss": 0.9633, + "step": 984 + }, + { + "epoch": 1.4254703328509406, + "grad_norm": 1.5036537647247314, + "learning_rate": 1.6532582461786004e-06, + "loss": 0.9165, + "step": 985 + }, + { + "epoch": 1.426917510853835, + "grad_norm": 1.5422682762145996, + "learning_rate": 1.6492357200321805e-06, + "loss": 0.9441, + "step": 986 + }, + { + "epoch": 1.4283646888567294, + "grad_norm": 1.5538753271102905, + "learning_rate": 1.6452131938857604e-06, + "loss": 0.9391, + "step": 987 + }, + { + "epoch": 1.4298118668596238, + "grad_norm": 1.532204508781433, + "learning_rate": 1.6411906677393405e-06, + "loss": 0.941, + "step": 988 + }, + { + "epoch": 1.431259044862518, + "grad_norm": 1.5170310735702515, + "learning_rate": 1.6371681415929204e-06, + "loss": 0.9241, + "step": 989 + }, + { + "epoch": 1.4327062228654124, + "grad_norm": 1.5310797691345215, + "learning_rate": 1.6331456154465006e-06, + "loss": 0.9815, + "step": 990 + }, + { + "epoch": 1.4327062228654124, + "eval_loss": 1.0488560199737549, + "eval_runtime": 23.7796, + "eval_samples_per_second": 42.053, + "eval_steps_per_second": 2.649, + "step": 990 + }, + { + "epoch": 1.4341534008683068, + "grad_norm": 1.5136878490447998, + "learning_rate": 1.6291230893000807e-06, + "loss": 0.9233, + "step": 991 + }, + { + "epoch": 1.4356005788712012, + "grad_norm": 1.5479440689086914, + "learning_rate": 1.6251005631536606e-06, + "loss": 0.9372, + "step": 992 + }, + { + "epoch": 1.4370477568740956, + "grad_norm": 1.4571396112442017, + "learning_rate": 1.6210780370072407e-06, + "loss": 0.9383, + "step": 993 + }, + { + "epoch": 1.43849493487699, + "grad_norm": 1.4667364358901978, + "learning_rate": 1.6170555108608207e-06, + "loss": 0.9527, + "step": 994 + }, + { + "epoch": 1.4399421128798842, + "grad_norm": 1.5052558183670044, + "learning_rate": 1.6130329847144008e-06, + "loss": 0.9565, + "step": 995 + }, + { + "epoch": 1.4413892908827786, + "grad_norm": 1.483472228050232, + "learning_rate": 1.609010458567981e-06, + "loss": 0.9673, + "step": 996 + }, + { + "epoch": 1.442836468885673, + "grad_norm": 1.4150869846343994, + "learning_rate": 1.6049879324215608e-06, + "loss": 0.9134, + "step": 997 + }, + { + "epoch": 1.4442836468885674, + "grad_norm": 1.514204978942871, + "learning_rate": 1.600965406275141e-06, + "loss": 0.8943, + "step": 998 + }, + { + "epoch": 1.4457308248914615, + "grad_norm": 1.4856035709381104, + "learning_rate": 1.5969428801287209e-06, + "loss": 0.9382, + "step": 999 + }, + { + "epoch": 1.447178002894356, + "grad_norm": 1.459563136100769, + "learning_rate": 1.592920353982301e-06, + "loss": 0.9274, + "step": 1000 + }, + { + "epoch": 1.447178002894356, + "eval_loss": 1.0501357316970825, + "eval_runtime": 23.7768, + "eval_samples_per_second": 42.058, + "eval_steps_per_second": 2.65, + "step": 1000 + }, + { + "epoch": 1.4486251808972503, + "grad_norm": 1.444888710975647, + "learning_rate": 1.5888978278358811e-06, + "loss": 0.8945, + "step": 1001 + }, + { + "epoch": 1.4500723589001447, + "grad_norm": 1.5484874248504639, + "learning_rate": 1.584875301689461e-06, + "loss": 0.9355, + "step": 1002 + }, + { + "epoch": 1.4515195369030391, + "grad_norm": 1.6098073720932007, + "learning_rate": 1.5808527755430412e-06, + "loss": 0.9213, + "step": 1003 + }, + { + "epoch": 1.4529667149059335, + "grad_norm": 1.4799352884292603, + "learning_rate": 1.576830249396621e-06, + "loss": 0.9797, + "step": 1004 + }, + { + "epoch": 1.4544138929088277, + "grad_norm": 1.4707530736923218, + "learning_rate": 1.5728077232502012e-06, + "loss": 0.9232, + "step": 1005 + }, + { + "epoch": 1.4558610709117221, + "grad_norm": 1.4996123313903809, + "learning_rate": 1.5687851971037813e-06, + "loss": 0.9472, + "step": 1006 + }, + { + "epoch": 1.4573082489146165, + "grad_norm": 1.509531021118164, + "learning_rate": 1.5647626709573613e-06, + "loss": 0.9023, + "step": 1007 + }, + { + "epoch": 1.458755426917511, + "grad_norm": 1.4184415340423584, + "learning_rate": 1.5607401448109414e-06, + "loss": 0.96, + "step": 1008 + }, + { + "epoch": 1.460202604920405, + "grad_norm": 1.4569298028945923, + "learning_rate": 1.5567176186645213e-06, + "loss": 0.9047, + "step": 1009 + }, + { + "epoch": 1.4616497829232995, + "grad_norm": 1.4666918516159058, + "learning_rate": 1.5526950925181014e-06, + "loss": 0.9333, + "step": 1010 + }, + { + "epoch": 1.4616497829232995, + "eval_loss": 1.0488444566726685, + "eval_runtime": 23.722, + "eval_samples_per_second": 42.155, + "eval_steps_per_second": 2.656, + "step": 1010 + }, + { + "epoch": 1.463096960926194, + "grad_norm": 1.427756667137146, + "learning_rate": 1.5486725663716816e-06, + "loss": 0.9459, + "step": 1011 + }, + { + "epoch": 1.4645441389290883, + "grad_norm": 1.4592232704162598, + "learning_rate": 1.5446500402252615e-06, + "loss": 0.9212, + "step": 1012 + }, + { + "epoch": 1.4659913169319827, + "grad_norm": 1.5486156940460205, + "learning_rate": 1.5406275140788416e-06, + "loss": 0.9442, + "step": 1013 + }, + { + "epoch": 1.467438494934877, + "grad_norm": 1.4669780731201172, + "learning_rate": 1.5366049879324215e-06, + "loss": 0.9193, + "step": 1014 + }, + { + "epoch": 1.4688856729377713, + "grad_norm": 1.4887443780899048, + "learning_rate": 1.5325824617860016e-06, + "loss": 0.9317, + "step": 1015 + }, + { + "epoch": 1.4703328509406657, + "grad_norm": 1.5064948797225952, + "learning_rate": 1.5285599356395816e-06, + "loss": 0.9637, + "step": 1016 + }, + { + "epoch": 1.47178002894356, + "grad_norm": 1.4491357803344727, + "learning_rate": 1.5245374094931617e-06, + "loss": 0.9618, + "step": 1017 + }, + { + "epoch": 1.4732272069464545, + "grad_norm": 1.4527958631515503, + "learning_rate": 1.5205148833467418e-06, + "loss": 0.9069, + "step": 1018 + }, + { + "epoch": 1.4746743849493487, + "grad_norm": 1.5057029724121094, + "learning_rate": 1.5164923572003221e-06, + "loss": 0.8924, + "step": 1019 + }, + { + "epoch": 1.476121562952243, + "grad_norm": 1.445718765258789, + "learning_rate": 1.512469831053902e-06, + "loss": 0.9905, + "step": 1020 + }, + { + "epoch": 1.476121562952243, + "eval_loss": 1.048025369644165, + "eval_runtime": 23.7262, + "eval_samples_per_second": 42.148, + "eval_steps_per_second": 2.655, + "step": 1020 + }, + { + "epoch": 1.4775687409551375, + "grad_norm": 1.5055482387542725, + "learning_rate": 1.5084473049074822e-06, + "loss": 0.9328, + "step": 1021 + }, + { + "epoch": 1.4790159189580319, + "grad_norm": 1.5045979022979736, + "learning_rate": 1.5044247787610621e-06, + "loss": 0.9513, + "step": 1022 + }, + { + "epoch": 1.4804630969609263, + "grad_norm": 1.4663984775543213, + "learning_rate": 1.5004022526146422e-06, + "loss": 0.9271, + "step": 1023 + }, + { + "epoch": 1.4819102749638207, + "grad_norm": 1.5243443250656128, + "learning_rate": 1.4963797264682224e-06, + "loss": 0.98, + "step": 1024 + }, + { + "epoch": 1.4833574529667148, + "grad_norm": 1.4335856437683105, + "learning_rate": 1.4923572003218023e-06, + "loss": 0.9186, + "step": 1025 + }, + { + "epoch": 1.4848046309696092, + "grad_norm": 1.4535353183746338, + "learning_rate": 1.4883346741753824e-06, + "loss": 0.9304, + "step": 1026 + }, + { + "epoch": 1.4862518089725036, + "grad_norm": 1.4894002676010132, + "learning_rate": 1.4843121480289623e-06, + "loss": 0.9403, + "step": 1027 + }, + { + "epoch": 1.487698986975398, + "grad_norm": 1.4735504388809204, + "learning_rate": 1.4802896218825425e-06, + "loss": 0.9307, + "step": 1028 + }, + { + "epoch": 1.4891461649782922, + "grad_norm": 1.5178442001342773, + "learning_rate": 1.4762670957361226e-06, + "loss": 0.9166, + "step": 1029 + }, + { + "epoch": 1.4905933429811866, + "grad_norm": 1.5546519756317139, + "learning_rate": 1.4722445695897025e-06, + "loss": 0.9226, + "step": 1030 + }, + { + "epoch": 1.4905933429811866, + "eval_loss": 1.0466636419296265, + "eval_runtime": 23.7993, + "eval_samples_per_second": 42.018, + "eval_steps_per_second": 2.647, + "step": 1030 + }, + { + "epoch": 1.492040520984081, + "grad_norm": 1.4860820770263672, + "learning_rate": 1.4682220434432826e-06, + "loss": 0.9686, + "step": 1031 + }, + { + "epoch": 1.4934876989869754, + "grad_norm": 1.4392701387405396, + "learning_rate": 1.4641995172968625e-06, + "loss": 0.9266, + "step": 1032 + }, + { + "epoch": 1.4949348769898698, + "grad_norm": 1.467860460281372, + "learning_rate": 1.4601769911504427e-06, + "loss": 0.9501, + "step": 1033 + }, + { + "epoch": 1.4963820549927642, + "grad_norm": 1.499380111694336, + "learning_rate": 1.4561544650040226e-06, + "loss": 0.9178, + "step": 1034 + }, + { + "epoch": 1.4978292329956584, + "grad_norm": 1.434569239616394, + "learning_rate": 1.4521319388576027e-06, + "loss": 0.9448, + "step": 1035 + }, + { + "epoch": 1.4992764109985528, + "grad_norm": 1.4242448806762695, + "learning_rate": 1.4481094127111828e-06, + "loss": 0.9099, + "step": 1036 + }, + { + "epoch": 1.5007235890014472, + "grad_norm": 1.4757294654846191, + "learning_rate": 1.4440868865647628e-06, + "loss": 0.9421, + "step": 1037 + }, + { + "epoch": 1.5021707670043414, + "grad_norm": 1.4763214588165283, + "learning_rate": 1.4400643604183429e-06, + "loss": 0.9063, + "step": 1038 + }, + { + "epoch": 1.5036179450072358, + "grad_norm": 1.4763214588165283, + "learning_rate": 1.4400643604183429e-06, + "loss": 0.9203, + "step": 1039 + }, + { + "epoch": 1.5050651230101302, + "grad_norm": 1.5087497234344482, + "learning_rate": 1.4360418342719228e-06, + "loss": 0.9366, + "step": 1040 + }, + { + "epoch": 1.5050651230101302, + "eval_loss": 1.0445455312728882, + "eval_runtime": 23.7966, + "eval_samples_per_second": 42.023, + "eval_steps_per_second": 2.647, + "step": 1040 + }, + { + "epoch": 1.5065123010130246, + "grad_norm": 1.4766031503677368, + "learning_rate": 1.432019308125503e-06, + "loss": 0.9484, + "step": 1041 + }, + { + "epoch": 1.507959479015919, + "grad_norm": 1.4830681085586548, + "learning_rate": 1.427996781979083e-06, + "loss": 0.9397, + "step": 1042 + }, + { + "epoch": 1.5094066570188134, + "grad_norm": 1.4869251251220703, + "learning_rate": 1.423974255832663e-06, + "loss": 0.9402, + "step": 1043 + }, + { + "epoch": 1.5108538350217078, + "grad_norm": 1.4764056205749512, + "learning_rate": 1.419951729686243e-06, + "loss": 0.9462, + "step": 1044 + }, + { + "epoch": 1.5123010130246022, + "grad_norm": 1.4674490690231323, + "learning_rate": 1.415929203539823e-06, + "loss": 0.9474, + "step": 1045 + }, + { + "epoch": 1.5137481910274964, + "grad_norm": 1.4767136573791504, + "learning_rate": 1.4119066773934031e-06, + "loss": 0.9242, + "step": 1046 + }, + { + "epoch": 1.5151953690303908, + "grad_norm": 1.4848299026489258, + "learning_rate": 1.4078841512469833e-06, + "loss": 0.899, + "step": 1047 + }, + { + "epoch": 1.516642547033285, + "grad_norm": 1.5661221742630005, + "learning_rate": 1.4038616251005632e-06, + "loss": 0.9421, + "step": 1048 + }, + { + "epoch": 1.5180897250361793, + "grad_norm": 1.4282113313674927, + "learning_rate": 1.3998390989541433e-06, + "loss": 0.9606, + "step": 1049 + }, + { + "epoch": 1.5195369030390737, + "grad_norm": 1.4687862396240234, + "learning_rate": 1.3958165728077232e-06, + "loss": 0.9312, + "step": 1050 + }, + { + "epoch": 1.5195369030390737, + "eval_loss": 1.0444204807281494, + "eval_runtime": 23.758, + "eval_samples_per_second": 42.091, + "eval_steps_per_second": 2.652, + "step": 1050 + }, + { + "epoch": 1.5209840810419681, + "grad_norm": 1.434773564338684, + "learning_rate": 1.3917940466613034e-06, + "loss": 0.9191, + "step": 1051 + }, + { + "epoch": 1.5224312590448625, + "grad_norm": 1.5584834814071655, + "learning_rate": 1.3877715205148835e-06, + "loss": 0.9414, + "step": 1052 + }, + { + "epoch": 1.523878437047757, + "grad_norm": 1.5007787942886353, + "learning_rate": 1.3837489943684634e-06, + "loss": 0.9599, + "step": 1053 + }, + { + "epoch": 1.5253256150506513, + "grad_norm": 1.578592300415039, + "learning_rate": 1.3797264682220435e-06, + "loss": 0.9589, + "step": 1054 + }, + { + "epoch": 1.5267727930535457, + "grad_norm": 1.4954944849014282, + "learning_rate": 1.3757039420756234e-06, + "loss": 0.94, + "step": 1055 + }, + { + "epoch": 1.52821997105644, + "grad_norm": 1.5336066484451294, + "learning_rate": 1.3716814159292036e-06, + "loss": 0.9336, + "step": 1056 + }, + { + "epoch": 1.5296671490593343, + "grad_norm": 1.4866039752960205, + "learning_rate": 1.3676588897827837e-06, + "loss": 0.9199, + "step": 1057 + }, + { + "epoch": 1.5311143270622285, + "grad_norm": 1.4646317958831787, + "learning_rate": 1.3636363636363636e-06, + "loss": 0.9141, + "step": 1058 + }, + { + "epoch": 1.532561505065123, + "grad_norm": 1.5008516311645508, + "learning_rate": 1.3596138374899437e-06, + "loss": 0.9268, + "step": 1059 + }, + { + "epoch": 1.5340086830680173, + "grad_norm": 1.5157554149627686, + "learning_rate": 1.3555913113435237e-06, + "loss": 0.9259, + "step": 1060 + }, + { + "epoch": 1.5340086830680173, + "eval_loss": 1.043142318725586, + "eval_runtime": 23.7193, + "eval_samples_per_second": 42.16, + "eval_steps_per_second": 2.656, + "step": 1060 + }, + { + "epoch": 1.5354558610709117, + "grad_norm": 1.5658875703811646, + "learning_rate": 1.3515687851971038e-06, + "loss": 0.9363, + "step": 1061 + }, + { + "epoch": 1.536903039073806, + "grad_norm": 1.5068069696426392, + "learning_rate": 1.347546259050684e-06, + "loss": 0.9453, + "step": 1062 + }, + { + "epoch": 1.5383502170767005, + "grad_norm": 1.3862087726593018, + "learning_rate": 1.3435237329042638e-06, + "loss": 0.9178, + "step": 1063 + }, + { + "epoch": 1.539797395079595, + "grad_norm": 1.4612709283828735, + "learning_rate": 1.339501206757844e-06, + "loss": 0.926, + "step": 1064 + }, + { + "epoch": 1.5412445730824893, + "grad_norm": 1.4535436630249023, + "learning_rate": 1.3354786806114243e-06, + "loss": 0.9288, + "step": 1065 + }, + { + "epoch": 1.5426917510853835, + "grad_norm": 1.5284571647644043, + "learning_rate": 1.3314561544650042e-06, + "loss": 0.9519, + "step": 1066 + }, + { + "epoch": 1.5441389290882779, + "grad_norm": 1.4716906547546387, + "learning_rate": 1.3274336283185843e-06, + "loss": 0.9187, + "step": 1067 + }, + { + "epoch": 1.545586107091172, + "grad_norm": 1.5695925951004028, + "learning_rate": 1.3234111021721643e-06, + "loss": 0.9387, + "step": 1068 + }, + { + "epoch": 1.5470332850940665, + "grad_norm": 1.4897812604904175, + "learning_rate": 1.3193885760257444e-06, + "loss": 0.926, + "step": 1069 + }, + { + "epoch": 1.5484804630969609, + "grad_norm": 1.5088276863098145, + "learning_rate": 1.3153660498793245e-06, + "loss": 0.9461, + "step": 1070 + }, + { + "epoch": 1.5484804630969609, + "eval_loss": 1.0454641580581665, + "eval_runtime": 23.7892, + "eval_samples_per_second": 42.036, + "eval_steps_per_second": 2.648, + "step": 1070 + }, + { + "epoch": 1.5499276410998553, + "grad_norm": 1.468730092048645, + "learning_rate": 1.3113435237329044e-06, + "loss": 0.947, + "step": 1071 + }, + { + "epoch": 1.5513748191027497, + "grad_norm": 1.4917826652526855, + "learning_rate": 1.3073209975864846e-06, + "loss": 0.9155, + "step": 1072 + }, + { + "epoch": 1.552821997105644, + "grad_norm": 1.51880943775177, + "learning_rate": 1.3032984714400645e-06, + "loss": 0.9264, + "step": 1073 + }, + { + "epoch": 1.5542691751085385, + "grad_norm": 1.5018504858016968, + "learning_rate": 1.2992759452936446e-06, + "loss": 0.9388, + "step": 1074 + }, + { + "epoch": 1.5557163531114329, + "grad_norm": 1.5512025356292725, + "learning_rate": 1.2952534191472247e-06, + "loss": 0.9388, + "step": 1075 + }, + { + "epoch": 1.557163531114327, + "grad_norm": 1.49564528465271, + "learning_rate": 1.2912308930008046e-06, + "loss": 0.9101, + "step": 1076 + }, + { + "epoch": 1.5586107091172214, + "grad_norm": 1.6322815418243408, + "learning_rate": 1.2872083668543848e-06, + "loss": 0.8976, + "step": 1077 + }, + { + "epoch": 1.5600578871201156, + "grad_norm": 1.5270243883132935, + "learning_rate": 1.2831858407079647e-06, + "loss": 0.9429, + "step": 1078 + }, + { + "epoch": 1.56150506512301, + "grad_norm": 1.4141950607299805, + "learning_rate": 1.2791633145615448e-06, + "loss": 0.8965, + "step": 1079 + }, + { + "epoch": 1.5629522431259044, + "grad_norm": 1.50956130027771, + "learning_rate": 1.275140788415125e-06, + "loss": 0.9001, + "step": 1080 + }, + { + "epoch": 1.5629522431259044, + "eval_loss": 1.044193148612976, + "eval_runtime": 23.8326, + "eval_samples_per_second": 41.959, + "eval_steps_per_second": 2.643, + "step": 1080 + }, + { + "epoch": 1.5643994211287988, + "grad_norm": 1.4992973804473877, + "learning_rate": 1.2711182622687049e-06, + "loss": 0.9656, + "step": 1081 + }, + { + "epoch": 1.5658465991316932, + "grad_norm": 1.4195139408111572, + "learning_rate": 1.267095736122285e-06, + "loss": 0.9353, + "step": 1082 + }, + { + "epoch": 1.5672937771345876, + "grad_norm": 1.4781336784362793, + "learning_rate": 1.263073209975865e-06, + "loss": 0.9449, + "step": 1083 + }, + { + "epoch": 1.568740955137482, + "grad_norm": 1.508346676826477, + "learning_rate": 1.259050683829445e-06, + "loss": 0.9496, + "step": 1084 + }, + { + "epoch": 1.5701881331403764, + "grad_norm": 1.449808955192566, + "learning_rate": 1.2550281576830252e-06, + "loss": 0.9038, + "step": 1085 + }, + { + "epoch": 1.5716353111432706, + "grad_norm": 1.445589303970337, + "learning_rate": 1.251005631536605e-06, + "loss": 0.9175, + "step": 1086 + }, + { + "epoch": 1.573082489146165, + "grad_norm": 1.4397412538528442, + "learning_rate": 1.2469831053901852e-06, + "loss": 0.9244, + "step": 1087 + }, + { + "epoch": 1.5745296671490592, + "grad_norm": 1.477348804473877, + "learning_rate": 1.2429605792437651e-06, + "loss": 0.95, + "step": 1088 + }, + { + "epoch": 1.5759768451519536, + "grad_norm": 1.5058271884918213, + "learning_rate": 1.2389380530973452e-06, + "loss": 0.9831, + "step": 1089 + }, + { + "epoch": 1.577424023154848, + "grad_norm": 1.5802448987960815, + "learning_rate": 1.2349155269509252e-06, + "loss": 0.9354, + "step": 1090 + }, + { + "epoch": 1.577424023154848, + "eval_loss": 1.043736219406128, + "eval_runtime": 23.7836, + "eval_samples_per_second": 42.046, + "eval_steps_per_second": 2.649, + "step": 1090 + }, + { + "epoch": 1.5788712011577424, + "grad_norm": 1.4747769832611084, + "learning_rate": 1.2308930008045053e-06, + "loss": 0.9321, + "step": 1091 + }, + { + "epoch": 1.5803183791606368, + "grad_norm": 1.5121163129806519, + "learning_rate": 1.2268704746580854e-06, + "loss": 0.9266, + "step": 1092 + }, + { + "epoch": 1.5817655571635312, + "grad_norm": 1.4969600439071655, + "learning_rate": 1.2228479485116653e-06, + "loss": 0.9163, + "step": 1093 + }, + { + "epoch": 1.5832127351664256, + "grad_norm": 1.4734262228012085, + "learning_rate": 1.2188254223652455e-06, + "loss": 0.9327, + "step": 1094 + }, + { + "epoch": 1.58465991316932, + "grad_norm": 1.4843584299087524, + "learning_rate": 1.2148028962188254e-06, + "loss": 0.9609, + "step": 1095 + }, + { + "epoch": 1.5861070911722142, + "grad_norm": 1.4566107988357544, + "learning_rate": 1.2107803700724055e-06, + "loss": 0.9168, + "step": 1096 + }, + { + "epoch": 1.5875542691751086, + "grad_norm": 1.432770013809204, + "learning_rate": 1.2067578439259856e-06, + "loss": 0.8865, + "step": 1097 + }, + { + "epoch": 1.5890014471780027, + "grad_norm": 1.471010446548462, + "learning_rate": 1.2027353177795658e-06, + "loss": 0.9133, + "step": 1098 + }, + { + "epoch": 1.5904486251808971, + "grad_norm": 1.4563469886779785, + "learning_rate": 1.1987127916331457e-06, + "loss": 0.9216, + "step": 1099 + }, + { + "epoch": 1.5918958031837915, + "grad_norm": 1.4933884143829346, + "learning_rate": 1.1946902654867258e-06, + "loss": 0.9346, + "step": 1100 + }, + { + "epoch": 1.5918958031837915, + "eval_loss": 1.042166829109192, + "eval_runtime": 23.7823, + "eval_samples_per_second": 42.048, + "eval_steps_per_second": 2.649, + "step": 1100 + }, + { + "epoch": 1.593342981186686, + "grad_norm": 1.4745537042617798, + "learning_rate": 1.190667739340306e-06, + "loss": 0.9427, + "step": 1101 + }, + { + "epoch": 1.5947901591895803, + "grad_norm": 1.425350546836853, + "learning_rate": 1.1866452131938858e-06, + "loss": 0.9171, + "step": 1102 + }, + { + "epoch": 1.5962373371924747, + "grad_norm": 1.4219218492507935, + "learning_rate": 1.182622687047466e-06, + "loss": 0.9375, + "step": 1103 + }, + { + "epoch": 1.5976845151953691, + "grad_norm": 1.522244930267334, + "learning_rate": 1.1786001609010459e-06, + "loss": 0.915, + "step": 1104 + }, + { + "epoch": 1.5991316931982635, + "grad_norm": 1.571389079093933, + "learning_rate": 1.174577634754626e-06, + "loss": 0.9314, + "step": 1105 + }, + { + "epoch": 1.6005788712011577, + "grad_norm": 1.4766952991485596, + "learning_rate": 1.1705551086082061e-06, + "loss": 0.912, + "step": 1106 + }, + { + "epoch": 1.6020260492040521, + "grad_norm": 1.4615398645401, + "learning_rate": 1.166532582461786e-06, + "loss": 0.9233, + "step": 1107 + }, + { + "epoch": 1.6034732272069463, + "grad_norm": 1.5288655757904053, + "learning_rate": 1.1625100563153662e-06, + "loss": 0.9053, + "step": 1108 + }, + { + "epoch": 1.6049204052098407, + "grad_norm": 1.5462576150894165, + "learning_rate": 1.158487530168946e-06, + "loss": 0.9213, + "step": 1109 + }, + { + "epoch": 1.606367583212735, + "grad_norm": 1.5279332399368286, + "learning_rate": 1.1544650040225262e-06, + "loss": 0.944, + "step": 1110 + }, + { + "epoch": 1.606367583212735, + "eval_loss": 1.0412065982818604, + "eval_runtime": 23.7191, + "eval_samples_per_second": 42.16, + "eval_steps_per_second": 2.656, + "step": 1110 + }, + { + "epoch": 1.6078147612156295, + "grad_norm": 1.4984546899795532, + "learning_rate": 1.1504424778761064e-06, + "loss": 0.9291, + "step": 1111 + }, + { + "epoch": 1.609261939218524, + "grad_norm": 1.5258090496063232, + "learning_rate": 1.1464199517296863e-06, + "loss": 0.9542, + "step": 1112 + }, + { + "epoch": 1.6107091172214183, + "grad_norm": 1.511910080909729, + "learning_rate": 1.1423974255832664e-06, + "loss": 0.94, + "step": 1113 + }, + { + "epoch": 1.6121562952243127, + "grad_norm": 1.5004017353057861, + "learning_rate": 1.1383748994368463e-06, + "loss": 0.9236, + "step": 1114 + }, + { + "epoch": 1.613603473227207, + "grad_norm": 1.461462378501892, + "learning_rate": 1.1343523732904264e-06, + "loss": 0.9213, + "step": 1115 + }, + { + "epoch": 1.6150506512301013, + "grad_norm": 1.5491946935653687, + "learning_rate": 1.1303298471440066e-06, + "loss": 0.9794, + "step": 1116 + }, + { + "epoch": 1.6164978292329957, + "grad_norm": 1.5602251291275024, + "learning_rate": 1.1263073209975865e-06, + "loss": 0.9351, + "step": 1117 + }, + { + "epoch": 1.6179450072358899, + "grad_norm": 1.5919190645217896, + "learning_rate": 1.1222847948511666e-06, + "loss": 0.9098, + "step": 1118 + }, + { + "epoch": 1.6193921852387843, + "grad_norm": 1.4915807247161865, + "learning_rate": 1.1182622687047465e-06, + "loss": 0.9518, + "step": 1119 + }, + { + "epoch": 1.6208393632416787, + "grad_norm": 1.551693081855774, + "learning_rate": 1.1142397425583269e-06, + "loss": 0.9395, + "step": 1120 + }, + { + "epoch": 1.6208393632416787, + "eval_loss": 1.0412206649780273, + "eval_runtime": 23.7936, + "eval_samples_per_second": 42.028, + "eval_steps_per_second": 2.648, + "step": 1120 + }, + { + "epoch": 1.622286541244573, + "grad_norm": 1.5632636547088623, + "learning_rate": 1.1102172164119068e-06, + "loss": 0.9418, + "step": 1121 + }, + { + "epoch": 1.6237337192474675, + "grad_norm": 1.5305129289627075, + "learning_rate": 1.106194690265487e-06, + "loss": 0.9521, + "step": 1122 + }, + { + "epoch": 1.6251808972503619, + "grad_norm": 1.5111137628555298, + "learning_rate": 1.1021721641190668e-06, + "loss": 0.916, + "step": 1123 + }, + { + "epoch": 1.6266280752532563, + "grad_norm": 1.45891535282135, + "learning_rate": 1.098149637972647e-06, + "loss": 0.9383, + "step": 1124 + }, + { + "epoch": 1.6280752532561507, + "grad_norm": 1.5136969089508057, + "learning_rate": 1.094127111826227e-06, + "loss": 0.9184, + "step": 1125 + }, + { + "epoch": 1.6295224312590448, + "grad_norm": 1.5937414169311523, + "learning_rate": 1.090104585679807e-06, + "loss": 0.9181, + "step": 1126 + }, + { + "epoch": 1.6309696092619392, + "grad_norm": 1.533246397972107, + "learning_rate": 1.0860820595333871e-06, + "loss": 0.9132, + "step": 1127 + }, + { + "epoch": 1.6324167872648334, + "grad_norm": 1.5128754377365112, + "learning_rate": 1.082059533386967e-06, + "loss": 0.9382, + "step": 1128 + }, + { + "epoch": 1.6338639652677278, + "grad_norm": 1.5426464080810547, + "learning_rate": 1.0780370072405472e-06, + "loss": 0.9418, + "step": 1129 + }, + { + "epoch": 1.6353111432706222, + "grad_norm": 1.5515700578689575, + "learning_rate": 1.0740144810941273e-06, + "loss": 0.9268, + "step": 1130 + }, + { + "epoch": 1.6353111432706222, + "eval_loss": 1.0395272970199585, + "eval_runtime": 23.7743, + "eval_samples_per_second": 42.062, + "eval_steps_per_second": 2.65, + "step": 1130 + }, + { + "epoch": 1.6367583212735166, + "grad_norm": 1.5215020179748535, + "learning_rate": 1.0699919549477072e-06, + "loss": 0.9166, + "step": 1131 + }, + { + "epoch": 1.638205499276411, + "grad_norm": 1.5605169534683228, + "learning_rate": 1.0659694288012873e-06, + "loss": 0.9391, + "step": 1132 + }, + { + "epoch": 1.6396526772793054, + "grad_norm": 1.5750532150268555, + "learning_rate": 1.0619469026548673e-06, + "loss": 0.899, + "step": 1133 + }, + { + "epoch": 1.6410998552821998, + "grad_norm": 1.5796549320220947, + "learning_rate": 1.0579243765084474e-06, + "loss": 0.9422, + "step": 1134 + }, + { + "epoch": 1.6425470332850942, + "grad_norm": 1.5482772588729858, + "learning_rate": 1.0539018503620275e-06, + "loss": 0.9583, + "step": 1135 + }, + { + "epoch": 1.6439942112879884, + "grad_norm": 1.534018874168396, + "learning_rate": 1.0498793242156074e-06, + "loss": 0.902, + "step": 1136 + }, + { + "epoch": 1.6454413892908828, + "grad_norm": 1.532336711883545, + "learning_rate": 1.0458567980691876e-06, + "loss": 0.9718, + "step": 1137 + }, + { + "epoch": 1.646888567293777, + "grad_norm": 1.5568121671676636, + "learning_rate": 1.0418342719227675e-06, + "loss": 0.9321, + "step": 1138 + }, + { + "epoch": 1.6483357452966714, + "grad_norm": 1.5286184549331665, + "learning_rate": 1.0378117457763476e-06, + "loss": 0.899, + "step": 1139 + }, + { + "epoch": 1.6497829232995658, + "grad_norm": 1.524092674255371, + "learning_rate": 1.0337892196299275e-06, + "loss": 0.9171, + "step": 1140 + }, + { + "epoch": 1.6497829232995658, + "eval_loss": 1.0385862588882446, + "eval_runtime": 23.8159, + "eval_samples_per_second": 41.989, + "eval_steps_per_second": 2.645, + "step": 1140 + }, + { + "epoch": 1.6512301013024602, + "grad_norm": 1.4606293439865112, + "learning_rate": 1.0297666934835076e-06, + "loss": 0.938, + "step": 1141 + }, + { + "epoch": 1.6526772793053546, + "grad_norm": 1.4511220455169678, + "learning_rate": 1.0257441673370878e-06, + "loss": 0.9096, + "step": 1142 + }, + { + "epoch": 1.654124457308249, + "grad_norm": 1.5572447776794434, + "learning_rate": 1.021721641190668e-06, + "loss": 0.9501, + "step": 1143 + }, + { + "epoch": 1.6555716353111434, + "grad_norm": 1.5110690593719482, + "learning_rate": 1.017699115044248e-06, + "loss": 0.9183, + "step": 1144 + }, + { + "epoch": 1.6570188133140378, + "grad_norm": 1.569759726524353, + "learning_rate": 1.013676588897828e-06, + "loss": 0.9424, + "step": 1145 + }, + { + "epoch": 1.658465991316932, + "grad_norm": 1.5364185571670532, + "learning_rate": 1.009654062751408e-06, + "loss": 0.9096, + "step": 1146 + }, + { + "epoch": 1.6599131693198264, + "grad_norm": 1.4737021923065186, + "learning_rate": 1.005631536604988e-06, + "loss": 0.9283, + "step": 1147 + }, + { + "epoch": 1.6613603473227205, + "grad_norm": 1.5624104738235474, + "learning_rate": 1.0016090104585681e-06, + "loss": 0.9483, + "step": 1148 + }, + { + "epoch": 1.662807525325615, + "grad_norm": 1.6266708374023438, + "learning_rate": 9.97586484312148e-07, + "loss": 0.917, + "step": 1149 + }, + { + "epoch": 1.6642547033285093, + "grad_norm": 1.5128072500228882, + "learning_rate": 9.935639581657282e-07, + "loss": 0.9143, + "step": 1150 + }, + { + "epoch": 1.6642547033285093, + "eval_loss": 1.0397101640701294, + "eval_runtime": 23.783, + "eval_samples_per_second": 42.047, + "eval_steps_per_second": 2.649, + "step": 1150 + }, + { + "epoch": 1.6657018813314037, + "grad_norm": 1.4989036321640015, + "learning_rate": 9.895414320193083e-07, + "loss": 0.9302, + "step": 1151 + }, + { + "epoch": 1.6671490593342981, + "grad_norm": 1.487831711769104, + "learning_rate": 9.855189058728882e-07, + "loss": 0.9368, + "step": 1152 + }, + { + "epoch": 1.6685962373371925, + "grad_norm": 1.501473069190979, + "learning_rate": 9.814963797264683e-07, + "loss": 0.9247, + "step": 1153 + }, + { + "epoch": 1.670043415340087, + "grad_norm": 1.5316646099090576, + "learning_rate": 9.774738535800482e-07, + "loss": 0.9197, + "step": 1154 + }, + { + "epoch": 1.6714905933429813, + "grad_norm": 1.5067342519760132, + "learning_rate": 9.734513274336284e-07, + "loss": 0.9531, + "step": 1155 + }, + { + "epoch": 1.6729377713458755, + "grad_norm": 1.550026774406433, + "learning_rate": 9.694288012872085e-07, + "loss": 0.9395, + "step": 1156 + }, + { + "epoch": 1.67438494934877, + "grad_norm": 1.425620436668396, + "learning_rate": 9.654062751407884e-07, + "loss": 0.9703, + "step": 1157 + }, + { + "epoch": 1.675832127351664, + "grad_norm": 1.4021658897399902, + "learning_rate": 9.613837489943685e-07, + "loss": 0.9102, + "step": 1158 + }, + { + "epoch": 1.6772793053545585, + "grad_norm": 1.5008976459503174, + "learning_rate": 9.573612228479485e-07, + "loss": 0.8952, + "step": 1159 + }, + { + "epoch": 1.678726483357453, + "grad_norm": 1.5313326120376587, + "learning_rate": 9.533386967015286e-07, + "loss": 0.9012, + "step": 1160 + }, + { + "epoch": 1.678726483357453, + "eval_loss": 1.037826657295227, + "eval_runtime": 23.8435, + "eval_samples_per_second": 41.94, + "eval_steps_per_second": 2.642, + "step": 1160 + }, + { + "epoch": 1.6801736613603473, + "grad_norm": 1.5341519117355347, + "learning_rate": 9.493161705551086e-07, + "loss": 0.8909, + "step": 1161 + }, + { + "epoch": 1.6816208393632417, + "grad_norm": 1.5521553754806519, + "learning_rate": 9.452936444086886e-07, + "loss": 0.893, + "step": 1162 + }, + { + "epoch": 1.683068017366136, + "grad_norm": 1.5209194421768188, + "learning_rate": 9.412711182622688e-07, + "loss": 0.8954, + "step": 1163 + }, + { + "epoch": 1.6845151953690305, + "grad_norm": 1.5101712942123413, + "learning_rate": 9.372485921158489e-07, + "loss": 0.9158, + "step": 1164 + }, + { + "epoch": 1.685962373371925, + "grad_norm": 1.5469932556152344, + "learning_rate": 9.332260659694289e-07, + "loss": 0.9238, + "step": 1165 + }, + { + "epoch": 1.687409551374819, + "grad_norm": 1.5016809701919556, + "learning_rate": 9.292035398230089e-07, + "loss": 0.9654, + "step": 1166 + }, + { + "epoch": 1.6888567293777135, + "grad_norm": 1.5810950994491577, + "learning_rate": 9.251810136765891e-07, + "loss": 0.9153, + "step": 1167 + }, + { + "epoch": 1.6903039073806077, + "grad_norm": 1.617267370223999, + "learning_rate": 9.211584875301691e-07, + "loss": 0.9745, + "step": 1168 + }, + { + "epoch": 1.691751085383502, + "grad_norm": 1.5095899105072021, + "learning_rate": 9.171359613837491e-07, + "loss": 0.9067, + "step": 1169 + }, + { + "epoch": 1.6931982633863965, + "grad_norm": 1.5304961204528809, + "learning_rate": 9.131134352373291e-07, + "loss": 0.9309, + "step": 1170 + }, + { + "epoch": 1.6931982633863965, + "eval_loss": 1.0388681888580322, + "eval_runtime": 23.7891, + "eval_samples_per_second": 42.036, + "eval_steps_per_second": 2.648, + "step": 1170 + }, + { + "epoch": 1.6946454413892909, + "grad_norm": 1.5729469060897827, + "learning_rate": 9.090909090909091e-07, + "loss": 0.922, + "step": 1171 + }, + { + "epoch": 1.6960926193921853, + "grad_norm": 1.5294268131256104, + "learning_rate": 9.050683829444893e-07, + "loss": 0.9132, + "step": 1172 + }, + { + "epoch": 1.6975397973950797, + "grad_norm": 1.5610600709915161, + "learning_rate": 9.010458567980693e-07, + "loss": 0.9338, + "step": 1173 + }, + { + "epoch": 1.698986975397974, + "grad_norm": 1.4757810831069946, + "learning_rate": 8.970233306516493e-07, + "loss": 0.9433, + "step": 1174 + }, + { + "epoch": 1.7004341534008685, + "grad_norm": 1.482692837715149, + "learning_rate": 8.930008045052293e-07, + "loss": 0.9371, + "step": 1175 + }, + { + "epoch": 1.7018813314037626, + "grad_norm": 1.6034132242202759, + "learning_rate": 8.889782783588094e-07, + "loss": 0.9288, + "step": 1176 + }, + { + "epoch": 1.703328509406657, + "grad_norm": 1.5254515409469604, + "learning_rate": 8.849557522123895e-07, + "loss": 0.914, + "step": 1177 + }, + { + "epoch": 1.7047756874095512, + "grad_norm": 1.514833688735962, + "learning_rate": 8.809332260659695e-07, + "loss": 0.9284, + "step": 1178 + }, + { + "epoch": 1.7062228654124456, + "grad_norm": 1.5230894088745117, + "learning_rate": 8.769106999195495e-07, + "loss": 0.9005, + "step": 1179 + }, + { + "epoch": 1.70767004341534, + "grad_norm": 1.5392180681228638, + "learning_rate": 8.728881737731296e-07, + "loss": 0.8974, + "step": 1180 + }, + { + "epoch": 1.70767004341534, + "eval_loss": 1.0386406183242798, + "eval_runtime": 23.8878, + "eval_samples_per_second": 41.862, + "eval_steps_per_second": 2.637, + "step": 1180 + }, + { + "epoch": 1.7091172214182344, + "grad_norm": 1.4879624843597412, + "learning_rate": 8.688656476267096e-07, + "loss": 0.932, + "step": 1181 + }, + { + "epoch": 1.7105643994211288, + "grad_norm": 1.6233673095703125, + "learning_rate": 8.648431214802896e-07, + "loss": 0.911, + "step": 1182 + }, + { + "epoch": 1.7120115774240232, + "grad_norm": 1.5614582300186157, + "learning_rate": 8.608205953338697e-07, + "loss": 0.9134, + "step": 1183 + }, + { + "epoch": 1.7134587554269176, + "grad_norm": 1.7120004892349243, + "learning_rate": 8.567980691874497e-07, + "loss": 0.9025, + "step": 1184 + }, + { + "epoch": 1.714905933429812, + "grad_norm": 1.6371057033538818, + "learning_rate": 8.527755430410298e-07, + "loss": 0.9404, + "step": 1185 + }, + { + "epoch": 1.7163531114327062, + "grad_norm": 1.5383172035217285, + "learning_rate": 8.487530168946098e-07, + "loss": 0.9525, + "step": 1186 + }, + { + "epoch": 1.7178002894356006, + "grad_norm": 1.5792254209518433, + "learning_rate": 8.4473049074819e-07, + "loss": 0.9459, + "step": 1187 + }, + { + "epoch": 1.7192474674384948, + "grad_norm": 1.5237640142440796, + "learning_rate": 8.4070796460177e-07, + "loss": 0.9305, + "step": 1188 + }, + { + "epoch": 1.7206946454413892, + "grad_norm": 1.5065224170684814, + "learning_rate": 8.366854384553501e-07, + "loss": 0.9442, + "step": 1189 + }, + { + "epoch": 1.7221418234442836, + "grad_norm": 1.5424607992172241, + "learning_rate": 8.326629123089301e-07, + "loss": 0.9376, + "step": 1190 + }, + { + "epoch": 1.7221418234442836, + "eval_loss": 1.0377265214920044, + "eval_runtime": 23.8236, + "eval_samples_per_second": 41.975, + "eval_steps_per_second": 2.644, + "step": 1190 + }, + { + "epoch": 1.723589001447178, + "grad_norm": 1.4731999635696411, + "learning_rate": 8.286403861625101e-07, + "loss": 0.9674, + "step": 1191 + }, + { + "epoch": 1.7250361794500724, + "grad_norm": 1.494404911994934, + "learning_rate": 8.246178600160902e-07, + "loss": 0.9267, + "step": 1192 + }, + { + "epoch": 1.7264833574529668, + "grad_norm": 1.461755633354187, + "learning_rate": 8.205953338696703e-07, + "loss": 0.9207, + "step": 1193 + }, + { + "epoch": 1.7279305354558612, + "grad_norm": 1.4410206079483032, + "learning_rate": 8.165728077232503e-07, + "loss": 0.92, + "step": 1194 + }, + { + "epoch": 1.7293777134587556, + "grad_norm": 1.6067509651184082, + "learning_rate": 8.125502815768303e-07, + "loss": 0.918, + "step": 1195 + }, + { + "epoch": 1.7308248914616498, + "grad_norm": 1.5347354412078857, + "learning_rate": 8.085277554304103e-07, + "loss": 0.9425, + "step": 1196 + }, + { + "epoch": 1.7322720694645442, + "grad_norm": 1.5482027530670166, + "learning_rate": 8.045052292839905e-07, + "loss": 0.9281, + "step": 1197 + }, + { + "epoch": 1.7337192474674383, + "grad_norm": 1.5965065956115723, + "learning_rate": 8.004827031375705e-07, + "loss": 0.9585, + "step": 1198 + }, + { + "epoch": 1.7351664254703327, + "grad_norm": 1.5257976055145264, + "learning_rate": 7.964601769911505e-07, + "loss": 0.9023, + "step": 1199 + }, + { + "epoch": 1.7366136034732271, + "grad_norm": 1.5278537273406982, + "learning_rate": 7.924376508447305e-07, + "loss": 0.9499, + "step": 1200 + }, + { + "epoch": 1.7366136034732271, + "eval_loss": 1.0380700826644897, + "eval_runtime": 23.8069, + "eval_samples_per_second": 42.005, + "eval_steps_per_second": 2.646, + "step": 1200 + }, + { + "epoch": 1.7380607814761215, + "grad_norm": 1.6284074783325195, + "learning_rate": 7.884151246983105e-07, + "loss": 0.9083, + "step": 1201 + }, + { + "epoch": 1.739507959479016, + "grad_norm": 1.5856940746307373, + "learning_rate": 7.843925985518907e-07, + "loss": 0.9344, + "step": 1202 + }, + { + "epoch": 1.7409551374819103, + "grad_norm": 1.5256085395812988, + "learning_rate": 7.803700724054707e-07, + "loss": 0.9111, + "step": 1203 + }, + { + "epoch": 1.7424023154848047, + "grad_norm": 1.5071070194244385, + "learning_rate": 7.763475462590507e-07, + "loss": 0.9117, + "step": 1204 + }, + { + "epoch": 1.7438494934876991, + "grad_norm": 1.506799578666687, + "learning_rate": 7.723250201126307e-07, + "loss": 0.9416, + "step": 1205 + }, + { + "epoch": 1.7452966714905933, + "grad_norm": 1.5650392770767212, + "learning_rate": 7.683024939662108e-07, + "loss": 0.9466, + "step": 1206 + }, + { + "epoch": 1.7467438494934877, + "grad_norm": 1.4933130741119385, + "learning_rate": 7.642799678197908e-07, + "loss": 0.9184, + "step": 1207 + }, + { + "epoch": 1.7481910274963819, + "grad_norm": 1.495843529701233, + "learning_rate": 7.602574416733709e-07, + "loss": 0.9199, + "step": 1208 + }, + { + "epoch": 1.7496382054992763, + "grad_norm": 1.531498908996582, + "learning_rate": 7.56234915526951e-07, + "loss": 0.95, + "step": 1209 + }, + { + "epoch": 1.7510853835021707, + "grad_norm": 1.4571256637573242, + "learning_rate": 7.522123893805311e-07, + "loss": 0.9021, + "step": 1210 + }, + { + "epoch": 1.7510853835021707, + "eval_loss": 1.0371652841567993, + "eval_runtime": 23.7513, + "eval_samples_per_second": 42.103, + "eval_steps_per_second": 2.652, + "step": 1210 + }, + { + "epoch": 1.752532561505065, + "grad_norm": 1.5395363569259644, + "learning_rate": 7.481898632341112e-07, + "loss": 0.9724, + "step": 1211 + }, + { + "epoch": 1.7539797395079595, + "grad_norm": 1.519291877746582, + "learning_rate": 7.441673370876912e-07, + "loss": 0.9193, + "step": 1212 + }, + { + "epoch": 1.755426917510854, + "grad_norm": 1.506554365158081, + "learning_rate": 7.401448109412712e-07, + "loss": 0.9097, + "step": 1213 + }, + { + "epoch": 1.7568740955137483, + "grad_norm": 1.586654543876648, + "learning_rate": 7.361222847948512e-07, + "loss": 0.9305, + "step": 1214 + }, + { + "epoch": 1.7583212735166427, + "grad_norm": 1.4750105142593384, + "learning_rate": 7.320997586484313e-07, + "loss": 0.9179, + "step": 1215 + }, + { + "epoch": 1.7597684515195369, + "grad_norm": 1.518219232559204, + "learning_rate": 7.280772325020113e-07, + "loss": 0.9354, + "step": 1216 + }, + { + "epoch": 1.7612156295224313, + "grad_norm": 1.4926543235778809, + "learning_rate": 7.240547063555914e-07, + "loss": 0.9396, + "step": 1217 + }, + { + "epoch": 1.7626628075253257, + "grad_norm": 1.4630756378173828, + "learning_rate": 7.200321802091714e-07, + "loss": 0.9202, + "step": 1218 + }, + { + "epoch": 1.7641099855282198, + "grad_norm": 1.5604909658432007, + "learning_rate": 7.160096540627515e-07, + "loss": 0.922, + "step": 1219 + }, + { + "epoch": 1.7655571635311142, + "grad_norm": 1.4380903244018555, + "learning_rate": 7.119871279163315e-07, + "loss": 0.9154, + "step": 1220 + }, + { + "epoch": 1.7655571635311142, + "eval_loss": 1.0385264158248901, + "eval_runtime": 23.7683, + "eval_samples_per_second": 42.073, + "eval_steps_per_second": 2.651, + "step": 1220 + }, + { + "epoch": 1.7670043415340086, + "grad_norm": 1.5557941198349, + "learning_rate": 7.079646017699115e-07, + "loss": 0.9175, + "step": 1221 + }, + { + "epoch": 1.768451519536903, + "grad_norm": 1.5896649360656738, + "learning_rate": 7.039420756234916e-07, + "loss": 0.9355, + "step": 1222 + }, + { + "epoch": 1.7698986975397974, + "grad_norm": 1.5210115909576416, + "learning_rate": 6.999195494770717e-07, + "loss": 0.947, + "step": 1223 + }, + { + "epoch": 1.7713458755426919, + "grad_norm": 1.4857864379882812, + "learning_rate": 6.958970233306517e-07, + "loss": 0.9202, + "step": 1224 + }, + { + "epoch": 1.7727930535455863, + "grad_norm": 1.506723403930664, + "learning_rate": 6.918744971842317e-07, + "loss": 0.913, + "step": 1225 + }, + { + "epoch": 1.7742402315484804, + "grad_norm": 1.5433999300003052, + "learning_rate": 6.878519710378117e-07, + "loss": 0.9441, + "step": 1226 + }, + { + "epoch": 1.7756874095513748, + "grad_norm": 1.4916256666183472, + "learning_rate": 6.838294448913918e-07, + "loss": 0.8972, + "step": 1227 + }, + { + "epoch": 1.7771345875542692, + "grad_norm": 1.47489595413208, + "learning_rate": 6.798069187449719e-07, + "loss": 0.9157, + "step": 1228 + }, + { + "epoch": 1.7785817655571634, + "grad_norm": 1.5604667663574219, + "learning_rate": 6.757843925985519e-07, + "loss": 0.8873, + "step": 1229 + }, + { + "epoch": 1.7800289435600578, + "grad_norm": 1.482895851135254, + "learning_rate": 6.717618664521319e-07, + "loss": 0.935, + "step": 1230 + }, + { + "epoch": 1.7800289435600578, + "eval_loss": 1.036209225654602, + "eval_runtime": 23.7895, + "eval_samples_per_second": 42.035, + "eval_steps_per_second": 2.648, + "step": 1230 + }, + { + "epoch": 1.7814761215629522, + "grad_norm": 1.4609028100967407, + "learning_rate": 6.677393403057121e-07, + "loss": 0.9331, + "step": 1231 + }, + { + "epoch": 1.7829232995658466, + "grad_norm": 1.4962233304977417, + "learning_rate": 6.637168141592922e-07, + "loss": 0.9226, + "step": 1232 + }, + { + "epoch": 1.784370477568741, + "grad_norm": 1.4788625240325928, + "learning_rate": 6.596942880128722e-07, + "loss": 0.9243, + "step": 1233 + }, + { + "epoch": 1.7858176555716354, + "grad_norm": 1.5288554430007935, + "learning_rate": 6.556717618664522e-07, + "loss": 0.9268, + "step": 1234 + }, + { + "epoch": 1.7872648335745298, + "grad_norm": 1.5255603790283203, + "learning_rate": 6.516492357200322e-07, + "loss": 0.9371, + "step": 1235 + }, + { + "epoch": 1.788712011577424, + "grad_norm": 1.5164626836776733, + "learning_rate": 6.476267095736124e-07, + "loss": 0.9228, + "step": 1236 + }, + { + "epoch": 1.7901591895803184, + "grad_norm": 1.5369902849197388, + "learning_rate": 6.436041834271924e-07, + "loss": 0.9028, + "step": 1237 + }, + { + "epoch": 1.7916063675832128, + "grad_norm": 1.4825102090835571, + "learning_rate": 6.395816572807724e-07, + "loss": 0.9297, + "step": 1238 + }, + { + "epoch": 1.793053545586107, + "grad_norm": 1.505773901939392, + "learning_rate": 6.355591311343524e-07, + "loss": 0.9476, + "step": 1239 + }, + { + "epoch": 1.7945007235890014, + "grad_norm": 1.5148533582687378, + "learning_rate": 6.315366049879324e-07, + "loss": 0.9226, + "step": 1240 + }, + { + "epoch": 1.7945007235890014, + "eval_loss": 1.0356080532073975, + "eval_runtime": 23.7488, + "eval_samples_per_second": 42.107, + "eval_steps_per_second": 2.653, + "step": 1240 + }, + { + "epoch": 1.7959479015918958, + "grad_norm": 1.5147360563278198, + "learning_rate": 6.275140788415126e-07, + "loss": 0.903, + "step": 1241 + }, + { + "epoch": 1.7973950795947902, + "grad_norm": 1.572090744972229, + "learning_rate": 6.234915526950926e-07, + "loss": 0.9437, + "step": 1242 + }, + { + "epoch": 1.7988422575976846, + "grad_norm": 1.5051279067993164, + "learning_rate": 6.194690265486726e-07, + "loss": 0.9458, + "step": 1243 + }, + { + "epoch": 1.800289435600579, + "grad_norm": 1.4279484748840332, + "learning_rate": 6.154465004022526e-07, + "loss": 0.9115, + "step": 1244 + }, + { + "epoch": 1.8017366136034734, + "grad_norm": 1.5181655883789062, + "learning_rate": 6.114239742558327e-07, + "loss": 0.9218, + "step": 1245 + }, + { + "epoch": 1.8031837916063675, + "grad_norm": 1.558053970336914, + "learning_rate": 6.074014481094127e-07, + "loss": 0.9378, + "step": 1246 + }, + { + "epoch": 1.804630969609262, + "grad_norm": 1.5189166069030762, + "learning_rate": 6.033789219629928e-07, + "loss": 0.9436, + "step": 1247 + }, + { + "epoch": 1.8060781476121563, + "grad_norm": 1.5421959161758423, + "learning_rate": 5.993563958165728e-07, + "loss": 0.9194, + "step": 1248 + }, + { + "epoch": 1.8075253256150505, + "grad_norm": 1.4810371398925781, + "learning_rate": 5.95333869670153e-07, + "loss": 0.9439, + "step": 1249 + }, + { + "epoch": 1.808972503617945, + "grad_norm": 1.41682767868042, + "learning_rate": 5.91311343523733e-07, + "loss": 0.9309, + "step": 1250 + }, + { + "epoch": 1.808972503617945, + "eval_loss": 1.0347821712493896, + "eval_runtime": 23.7782, + "eval_samples_per_second": 42.055, + "eval_steps_per_second": 2.649, + "step": 1250 + }, + { + "epoch": 1.8104196816208393, + "grad_norm": 1.5308350324630737, + "learning_rate": 5.87288817377313e-07, + "loss": 0.9153, + "step": 1251 + }, + { + "epoch": 1.8118668596237337, + "grad_norm": 1.4509097337722778, + "learning_rate": 5.83266291230893e-07, + "loss": 0.9193, + "step": 1252 + }, + { + "epoch": 1.8133140376266281, + "grad_norm": 1.4815465211868286, + "learning_rate": 5.79243765084473e-07, + "loss": 0.918, + "step": 1253 + }, + { + "epoch": 1.8147612156295225, + "grad_norm": 1.4463987350463867, + "learning_rate": 5.752212389380532e-07, + "loss": 0.9226, + "step": 1254 + }, + { + "epoch": 1.816208393632417, + "grad_norm": 1.494334101676941, + "learning_rate": 5.711987127916332e-07, + "loss": 0.952, + "step": 1255 + }, + { + "epoch": 1.817655571635311, + "grad_norm": 1.540571689605713, + "learning_rate": 5.671761866452132e-07, + "loss": 0.9016, + "step": 1256 + }, + { + "epoch": 1.8191027496382055, + "grad_norm": 1.5218087434768677, + "learning_rate": 5.631536604987932e-07, + "loss": 0.9188, + "step": 1257 + }, + { + "epoch": 1.8205499276411, + "grad_norm": 1.4856946468353271, + "learning_rate": 5.591311343523733e-07, + "loss": 0.9111, + "step": 1258 + }, + { + "epoch": 1.821997105643994, + "grad_norm": 1.5391303300857544, + "learning_rate": 5.551086082059534e-07, + "loss": 0.9255, + "step": 1259 + }, + { + "epoch": 1.8234442836468885, + "grad_norm": 1.5547950267791748, + "learning_rate": 5.510860820595334e-07, + "loss": 0.9458, + "step": 1260 + }, + { + "epoch": 1.8234442836468885, + "eval_loss": 1.0358872413635254, + "eval_runtime": 23.7224, + "eval_samples_per_second": 42.154, + "eval_steps_per_second": 2.656, + "step": 1260 + }, + { + "epoch": 1.8248914616497829, + "grad_norm": 1.5779333114624023, + "learning_rate": 5.470635559131135e-07, + "loss": 0.9281, + "step": 1261 + }, + { + "epoch": 1.8263386396526773, + "grad_norm": 1.5649796724319458, + "learning_rate": 5.430410297666936e-07, + "loss": 0.9, + "step": 1262 + }, + { + "epoch": 1.8277858176555717, + "grad_norm": 1.4962213039398193, + "learning_rate": 5.390185036202736e-07, + "loss": 0.8924, + "step": 1263 + }, + { + "epoch": 1.829232995658466, + "grad_norm": 1.5567576885223389, + "learning_rate": 5.349959774738536e-07, + "loss": 0.9399, + "step": 1264 + }, + { + "epoch": 1.8306801736613605, + "grad_norm": 1.4440345764160156, + "learning_rate": 5.309734513274336e-07, + "loss": 0.9205, + "step": 1265 + }, + { + "epoch": 1.8321273516642547, + "grad_norm": 1.4883509874343872, + "learning_rate": 5.269509251810138e-07, + "loss": 0.8948, + "step": 1266 + }, + { + "epoch": 1.833574529667149, + "grad_norm": 1.5461758375167847, + "learning_rate": 5.229283990345938e-07, + "loss": 0.9293, + "step": 1267 + }, + { + "epoch": 1.8350217076700435, + "grad_norm": 1.531106948852539, + "learning_rate": 5.189058728881738e-07, + "loss": 0.8835, + "step": 1268 + }, + { + "epoch": 1.8364688856729376, + "grad_norm": 1.5766586065292358, + "learning_rate": 5.148833467417538e-07, + "loss": 0.9322, + "step": 1269 + }, + { + "epoch": 1.837916063675832, + "grad_norm": 1.5971323251724243, + "learning_rate": 5.10860820595334e-07, + "loss": 0.9516, + "step": 1270 + }, + { + "epoch": 1.837916063675832, + "eval_loss": 1.0358165502548218, + "eval_runtime": 23.8409, + "eval_samples_per_second": 41.945, + "eval_steps_per_second": 2.643, + "step": 1270 + }, + { + "epoch": 1.8393632416787264, + "grad_norm": 1.4922758340835571, + "learning_rate": 5.06838294448914e-07, + "loss": 0.9198, + "step": 1271 + }, + { + "epoch": 1.8408104196816208, + "grad_norm": 1.5153942108154297, + "learning_rate": 5.02815768302494e-07, + "loss": 0.9401, + "step": 1272 + }, + { + "epoch": 1.8422575976845152, + "grad_norm": 1.4995721578598022, + "learning_rate": 4.98793242156074e-07, + "loss": 0.9061, + "step": 1273 + }, + { + "epoch": 1.8437047756874096, + "grad_norm": 1.5555169582366943, + "learning_rate": 4.947707160096541e-07, + "loss": 0.9458, + "step": 1274 + }, + { + "epoch": 1.845151953690304, + "grad_norm": 1.5097185373306274, + "learning_rate": 4.907481898632342e-07, + "loss": 0.9247, + "step": 1275 + }, + { + "epoch": 1.8465991316931982, + "grad_norm": 1.6253676414489746, + "learning_rate": 4.867256637168142e-07, + "loss": 0.912, + "step": 1276 + }, + { + "epoch": 1.8480463096960926, + "grad_norm": 1.4750977754592896, + "learning_rate": 4.827031375703942e-07, + "loss": 0.932, + "step": 1277 + }, + { + "epoch": 1.849493487698987, + "grad_norm": 1.5606588125228882, + "learning_rate": 4.786806114239742e-07, + "loss": 0.9365, + "step": 1278 + }, + { + "epoch": 1.8509406657018812, + "grad_norm": 1.6095157861709595, + "learning_rate": 4.746580852775543e-07, + "loss": 0.905, + "step": 1279 + }, + { + "epoch": 1.8523878437047756, + "grad_norm": 1.490276575088501, + "learning_rate": 4.706355591311344e-07, + "loss": 0.9243, + "step": 1280 + }, + { + "epoch": 1.8523878437047756, + "eval_loss": 1.035555362701416, + "eval_runtime": 23.8144, + "eval_samples_per_second": 41.991, + "eval_steps_per_second": 2.645, + "step": 1280 + }, + { + "epoch": 1.85383502170767, + "grad_norm": 1.511925458908081, + "learning_rate": 4.6661303298471445e-07, + "loss": 0.8957, + "step": 1281 + }, + { + "epoch": 1.8552821997105644, + "grad_norm": 1.5050299167633057, + "learning_rate": 4.6259050683829453e-07, + "loss": 0.888, + "step": 1282 + }, + { + "epoch": 1.8567293777134588, + "grad_norm": 1.5600636005401611, + "learning_rate": 4.5856798069187455e-07, + "loss": 0.9203, + "step": 1283 + }, + { + "epoch": 1.8581765557163532, + "grad_norm": 1.5475517511367798, + "learning_rate": 4.5454545454545457e-07, + "loss": 0.9517, + "step": 1284 + }, + { + "epoch": 1.8596237337192476, + "grad_norm": 1.5837122201919556, + "learning_rate": 4.5052292839903465e-07, + "loss": 0.9168, + "step": 1285 + }, + { + "epoch": 1.8610709117221418, + "grad_norm": 1.5051043033599854, + "learning_rate": 4.4650040225261467e-07, + "loss": 0.9306, + "step": 1286 + }, + { + "epoch": 1.8625180897250362, + "grad_norm": 1.452130675315857, + "learning_rate": 4.4247787610619474e-07, + "loss": 0.9049, + "step": 1287 + }, + { + "epoch": 1.8639652677279306, + "grad_norm": 1.5144529342651367, + "learning_rate": 4.3845534995977477e-07, + "loss": 0.9194, + "step": 1288 + }, + { + "epoch": 1.8654124457308248, + "grad_norm": 1.518511414527893, + "learning_rate": 4.344328238133548e-07, + "loss": 0.9384, + "step": 1289 + }, + { + "epoch": 1.8668596237337192, + "grad_norm": 1.548079013824463, + "learning_rate": 4.3041029766693486e-07, + "loss": 0.9424, + "step": 1290 + }, + { + "epoch": 1.8668596237337192, + "eval_loss": 1.0355305671691895, + "eval_runtime": 23.8154, + "eval_samples_per_second": 41.99, + "eval_steps_per_second": 2.645, + "step": 1290 + }, + { + "epoch": 1.8683068017366136, + "grad_norm": 1.5100760459899902, + "learning_rate": 4.263877715205149e-07, + "loss": 0.8995, + "step": 1291 + }, + { + "epoch": 1.869753979739508, + "grad_norm": 1.4751876592636108, + "learning_rate": 4.22365245374095e-07, + "loss": 0.9154, + "step": 1292 + }, + { + "epoch": 1.8712011577424024, + "grad_norm": 1.53620183467865, + "learning_rate": 4.1834271922767503e-07, + "loss": 0.9313, + "step": 1293 + }, + { + "epoch": 1.8726483357452968, + "grad_norm": 1.5257290601730347, + "learning_rate": 4.1432019308125506e-07, + "loss": 0.8978, + "step": 1294 + }, + { + "epoch": 1.8740955137481912, + "grad_norm": 1.5697894096374512, + "learning_rate": 4.1029766693483513e-07, + "loss": 0.9544, + "step": 1295 + }, + { + "epoch": 1.8755426917510853, + "grad_norm": 1.5875900983810425, + "learning_rate": 4.0627514078841515e-07, + "loss": 0.9363, + "step": 1296 + }, + { + "epoch": 1.8769898697539797, + "grad_norm": 1.4910507202148438, + "learning_rate": 4.0225261464199523e-07, + "loss": 0.9377, + "step": 1297 + }, + { + "epoch": 1.8784370477568741, + "grad_norm": 1.4932316541671753, + "learning_rate": 3.9823008849557525e-07, + "loss": 0.9071, + "step": 1298 + }, + { + "epoch": 1.8798842257597683, + "grad_norm": 1.5614124536514282, + "learning_rate": 3.9420756234915527e-07, + "loss": 0.9224, + "step": 1299 + }, + { + "epoch": 1.8813314037626627, + "grad_norm": 1.5470384359359741, + "learning_rate": 3.9018503620273535e-07, + "loss": 0.9328, + "step": 1300 + }, + { + "epoch": 1.8813314037626627, + "eval_loss": 1.0352489948272705, + "eval_runtime": 23.8699, + "eval_samples_per_second": 41.894, + "eval_steps_per_second": 2.639, + "step": 1300 + }, + { + "epoch": 1.8827785817655571, + "grad_norm": 1.4845808744430542, + "learning_rate": 3.8616251005631537e-07, + "loss": 0.9304, + "step": 1301 + }, + { + "epoch": 1.8842257597684515, + "grad_norm": 1.4711196422576904, + "learning_rate": 3.821399839098954e-07, + "loss": 0.9427, + "step": 1302 + }, + { + "epoch": 1.885672937771346, + "grad_norm": 1.5020239353179932, + "learning_rate": 3.781174577634755e-07, + "loss": 0.9281, + "step": 1303 + }, + { + "epoch": 1.8871201157742403, + "grad_norm": 1.5225720405578613, + "learning_rate": 3.740949316170556e-07, + "loss": 0.9145, + "step": 1304 + }, + { + "epoch": 1.8885672937771347, + "grad_norm": 1.5384893417358398, + "learning_rate": 3.700724054706356e-07, + "loss": 0.9054, + "step": 1305 + }, + { + "epoch": 1.890014471780029, + "grad_norm": 1.5327421426773071, + "learning_rate": 3.6604987932421563e-07, + "loss": 0.8853, + "step": 1306 + }, + { + "epoch": 1.8914616497829233, + "grad_norm": 1.5172675848007202, + "learning_rate": 3.620273531777957e-07, + "loss": 0.9215, + "step": 1307 + }, + { + "epoch": 1.8929088277858177, + "grad_norm": 1.5183223485946655, + "learning_rate": 3.5800482703137573e-07, + "loss": 0.9274, + "step": 1308 + }, + { + "epoch": 1.8943560057887119, + "grad_norm": 1.5297460556030273, + "learning_rate": 3.5398230088495575e-07, + "loss": 0.9285, + "step": 1309 + }, + { + "epoch": 1.8958031837916063, + "grad_norm": 1.4771268367767334, + "learning_rate": 3.4995977473853583e-07, + "loss": 0.9306, + "step": 1310 + }, + { + "epoch": 1.8958031837916063, + "eval_loss": 1.035001516342163, + "eval_runtime": 23.7771, + "eval_samples_per_second": 42.057, + "eval_steps_per_second": 2.65, + "step": 1310 + }, + { + "epoch": 1.8972503617945007, + "grad_norm": 1.5330641269683838, + "learning_rate": 3.4593724859211585e-07, + "loss": 0.918, + "step": 1311 + }, + { + "epoch": 1.898697539797395, + "grad_norm": 1.5373104810714722, + "learning_rate": 3.419147224456959e-07, + "loss": 0.9098, + "step": 1312 + }, + { + "epoch": 1.9001447178002895, + "grad_norm": 1.50400710105896, + "learning_rate": 3.3789219629927595e-07, + "loss": 0.9291, + "step": 1313 + }, + { + "epoch": 1.9015918958031839, + "grad_norm": 1.5028769969940186, + "learning_rate": 3.338696701528561e-07, + "loss": 0.9249, + "step": 1314 + }, + { + "epoch": 1.9030390738060783, + "grad_norm": 1.5838807821273804, + "learning_rate": 3.298471440064361e-07, + "loss": 0.9415, + "step": 1315 + }, + { + "epoch": 1.9044862518089725, + "grad_norm": 1.5784761905670166, + "learning_rate": 3.258246178600161e-07, + "loss": 0.9048, + "step": 1316 + }, + { + "epoch": 1.9059334298118669, + "grad_norm": 1.4618431329727173, + "learning_rate": 3.218020917135962e-07, + "loss": 0.9001, + "step": 1317 + }, + { + "epoch": 1.9073806078147613, + "grad_norm": 1.4717321395874023, + "learning_rate": 3.177795655671762e-07, + "loss": 0.9402, + "step": 1318 + }, + { + "epoch": 1.9088277858176554, + "grad_norm": 1.5131185054779053, + "learning_rate": 3.137570394207563e-07, + "loss": 0.9148, + "step": 1319 + }, + { + "epoch": 1.9102749638205498, + "grad_norm": 1.4573482275009155, + "learning_rate": 3.097345132743363e-07, + "loss": 0.8981, + "step": 1320 + }, + { + "epoch": 1.9102749638205498, + "eval_loss": 1.0335999727249146, + "eval_runtime": 23.8385, + "eval_samples_per_second": 41.949, + "eval_steps_per_second": 2.643, + "step": 1320 + }, + { + "epoch": 1.9117221418234442, + "grad_norm": 1.5507259368896484, + "learning_rate": 3.0571198712791633e-07, + "loss": 0.9407, + "step": 1321 + }, + { + "epoch": 1.9131693198263386, + "grad_norm": 1.5146931409835815, + "learning_rate": 3.016894609814964e-07, + "loss": 0.9387, + "step": 1322 + }, + { + "epoch": 1.914616497829233, + "grad_norm": 1.5338430404663086, + "learning_rate": 2.976669348350765e-07, + "loss": 0.9416, + "step": 1323 + }, + { + "epoch": 1.9160636758321274, + "grad_norm": 1.5134283304214478, + "learning_rate": 2.936444086886565e-07, + "loss": 0.9197, + "step": 1324 + }, + { + "epoch": 1.9175108538350218, + "grad_norm": 1.5270968675613403, + "learning_rate": 2.896218825422365e-07, + "loss": 0.9199, + "step": 1325 + }, + { + "epoch": 1.918958031837916, + "grad_norm": 1.4260802268981934, + "learning_rate": 2.855993563958166e-07, + "loss": 0.9138, + "step": 1326 + }, + { + "epoch": 1.9204052098408104, + "grad_norm": 1.5940711498260498, + "learning_rate": 2.815768302493966e-07, + "loss": 0.9686, + "step": 1327 + }, + { + "epoch": 1.9218523878437048, + "grad_norm": 1.4830875396728516, + "learning_rate": 2.775543041029767e-07, + "loss": 0.8988, + "step": 1328 + }, + { + "epoch": 1.923299565846599, + "grad_norm": 1.510819673538208, + "learning_rate": 2.7353177795655677e-07, + "loss": 0.9154, + "step": 1329 + }, + { + "epoch": 1.9247467438494934, + "grad_norm": 1.5255513191223145, + "learning_rate": 2.695092518101368e-07, + "loss": 0.9336, + "step": 1330 + }, + { + "epoch": 1.9247467438494934, + "eval_loss": 1.0334358215332031, + "eval_runtime": 23.8036, + "eval_samples_per_second": 42.011, + "eval_steps_per_second": 2.647, + "step": 1330 + }, + { + "epoch": 1.9261939218523878, + "grad_norm": 1.5139683485031128, + "learning_rate": 2.654867256637168e-07, + "loss": 0.9396, + "step": 1331 + }, + { + "epoch": 1.9276410998552822, + "grad_norm": 1.5131778717041016, + "learning_rate": 2.614641995172969e-07, + "loss": 0.8926, + "step": 1332 + }, + { + "epoch": 1.9290882778581766, + "grad_norm": 1.519347071647644, + "learning_rate": 2.574416733708769e-07, + "loss": 0.9139, + "step": 1333 + }, + { + "epoch": 1.930535455861071, + "grad_norm": 1.6185059547424316, + "learning_rate": 2.53419147224457e-07, + "loss": 0.9485, + "step": 1334 + }, + { + "epoch": 1.9319826338639654, + "grad_norm": 1.4924019575119019, + "learning_rate": 2.49396621078037e-07, + "loss": 0.9135, + "step": 1335 + }, + { + "epoch": 1.9334298118668596, + "grad_norm": 1.474297046661377, + "learning_rate": 2.453740949316171e-07, + "loss": 0.9005, + "step": 1336 + }, + { + "epoch": 1.934876989869754, + "grad_norm": 1.504992961883545, + "learning_rate": 2.413515687851971e-07, + "loss": 0.9289, + "step": 1337 + }, + { + "epoch": 1.9363241678726484, + "grad_norm": 1.4555882215499878, + "learning_rate": 2.3732904263877715e-07, + "loss": 0.9172, + "step": 1338 + }, + { + "epoch": 1.9377713458755426, + "grad_norm": 1.5565623044967651, + "learning_rate": 2.3330651649235723e-07, + "loss": 0.9413, + "step": 1339 + }, + { + "epoch": 1.939218523878437, + "grad_norm": 1.5923689603805542, + "learning_rate": 2.2928399034593728e-07, + "loss": 0.8993, + "step": 1340 + }, + { + "epoch": 1.939218523878437, + "eval_loss": 1.0337510108947754, + "eval_runtime": 23.7954, + "eval_samples_per_second": 42.025, + "eval_steps_per_second": 2.648, + "step": 1340 + }, + { + "epoch": 1.9406657018813314, + "grad_norm": 1.5740083456039429, + "learning_rate": 2.2526146419951732e-07, + "loss": 0.9171, + "step": 1341 + }, + { + "epoch": 1.9421128798842258, + "grad_norm": 1.5902602672576904, + "learning_rate": 2.2123893805309737e-07, + "loss": 0.8975, + "step": 1342 + }, + { + "epoch": 1.9435600578871202, + "grad_norm": 1.4776948690414429, + "learning_rate": 2.172164119066774e-07, + "loss": 0.9305, + "step": 1343 + }, + { + "epoch": 1.9450072358900146, + "grad_norm": 1.475582242012024, + "learning_rate": 2.1319388576025744e-07, + "loss": 0.9395, + "step": 1344 + }, + { + "epoch": 1.946454413892909, + "grad_norm": 1.5595847368240356, + "learning_rate": 2.0917135961383752e-07, + "loss": 0.955, + "step": 1345 + }, + { + "epoch": 1.9479015918958031, + "grad_norm": 1.541603922843933, + "learning_rate": 2.0514883346741757e-07, + "loss": 0.9679, + "step": 1346 + }, + { + "epoch": 1.9493487698986975, + "grad_norm": 1.5499169826507568, + "learning_rate": 2.0112630732099761e-07, + "loss": 0.9036, + "step": 1347 + }, + { + "epoch": 1.950795947901592, + "grad_norm": 1.4720523357391357, + "learning_rate": 1.9710378117457764e-07, + "loss": 0.9221, + "step": 1348 + }, + { + "epoch": 1.9522431259044861, + "grad_norm": 1.4793193340301514, + "learning_rate": 1.9308125502815768e-07, + "loss": 0.9036, + "step": 1349 + }, + { + "epoch": 1.9536903039073805, + "grad_norm": 1.5033717155456543, + "learning_rate": 1.8905872888173776e-07, + "loss": 0.949, + "step": 1350 + }, + { + "epoch": 1.9536903039073805, + "eval_loss": 1.0329711437225342, + "eval_runtime": 23.7939, + "eval_samples_per_second": 42.028, + "eval_steps_per_second": 2.648, + "step": 1350 + }, + { + "epoch": 1.955137481910275, + "grad_norm": 1.5453612804412842, + "learning_rate": 1.850362027353178e-07, + "loss": 0.9101, + "step": 1351 + }, + { + "epoch": 1.9565846599131693, + "grad_norm": 1.5425647497177124, + "learning_rate": 1.8101367658889785e-07, + "loss": 0.9204, + "step": 1352 + }, + { + "epoch": 1.9580318379160637, + "grad_norm": 1.5732142925262451, + "learning_rate": 1.7699115044247788e-07, + "loss": 0.9343, + "step": 1353 + }, + { + "epoch": 1.9594790159189581, + "grad_norm": 1.5321398973464966, + "learning_rate": 1.7296862429605792e-07, + "loss": 0.9484, + "step": 1354 + }, + { + "epoch": 1.9609261939218525, + "grad_norm": 1.563483715057373, + "learning_rate": 1.6894609814963797e-07, + "loss": 0.9418, + "step": 1355 + }, + { + "epoch": 1.9623733719247467, + "grad_norm": 1.496347188949585, + "learning_rate": 1.6492357200321805e-07, + "loss": 0.9045, + "step": 1356 + }, + { + "epoch": 1.963820549927641, + "grad_norm": 1.5437716245651245, + "learning_rate": 1.609010458567981e-07, + "loss": 0.9257, + "step": 1357 + }, + { + "epoch": 1.9652677279305355, + "grad_norm": 1.486731767654419, + "learning_rate": 1.5687851971037814e-07, + "loss": 0.8979, + "step": 1358 + }, + { + "epoch": 1.9667149059334297, + "grad_norm": 1.4655122756958008, + "learning_rate": 1.5285599356395817e-07, + "loss": 0.9108, + "step": 1359 + }, + { + "epoch": 1.968162083936324, + "grad_norm": 1.5855398178100586, + "learning_rate": 1.4883346741753824e-07, + "loss": 0.9525, + "step": 1360 + }, + { + "epoch": 1.968162083936324, + "eval_loss": 1.0326709747314453, + "eval_runtime": 23.7066, + "eval_samples_per_second": 42.182, + "eval_steps_per_second": 2.657, + "step": 1360 + }, + { + "epoch": 1.9696092619392185, + "grad_norm": 1.4950917959213257, + "learning_rate": 1.4481094127111826e-07, + "loss": 0.9118, + "step": 1361 + }, + { + "epoch": 1.9710564399421129, + "grad_norm": 1.5167419910430908, + "learning_rate": 1.407884151246983e-07, + "loss": 0.9378, + "step": 1362 + }, + { + "epoch": 1.9725036179450073, + "grad_norm": 1.49794340133667, + "learning_rate": 1.3676588897827839e-07, + "loss": 0.9199, + "step": 1363 + }, + { + "epoch": 1.9739507959479017, + "grad_norm": 1.5848218202590942, + "learning_rate": 1.327433628318584e-07, + "loss": 0.9398, + "step": 1364 + }, + { + "epoch": 1.975397973950796, + "grad_norm": 1.4511967897415161, + "learning_rate": 1.2872083668543846e-07, + "loss": 0.9183, + "step": 1365 + }, + { + "epoch": 1.9768451519536903, + "grad_norm": 1.489917516708374, + "learning_rate": 1.246983105390185e-07, + "loss": 0.9444, + "step": 1366 + }, + { + "epoch": 1.9782923299565847, + "grad_norm": 1.555161952972412, + "learning_rate": 1.2067578439259855e-07, + "loss": 0.9551, + "step": 1367 + }, + { + "epoch": 1.979739507959479, + "grad_norm": 1.5618079900741577, + "learning_rate": 1.1665325824617861e-07, + "loss": 0.8975, + "step": 1368 + }, + { + "epoch": 1.9811866859623732, + "grad_norm": 1.47573721408844, + "learning_rate": 1.1263073209975866e-07, + "loss": 0.9261, + "step": 1369 + }, + { + "epoch": 1.9826338639652676, + "grad_norm": 1.5010806322097778, + "learning_rate": 1.086082059533387e-07, + "loss": 0.9582, + "step": 1370 + }, + { + "epoch": 1.9826338639652676, + "eval_loss": 1.0328552722930908, + "eval_runtime": 23.7804, + "eval_samples_per_second": 42.052, + "eval_steps_per_second": 2.649, + "step": 1370 + }, + { + "epoch": 1.984081041968162, + "grad_norm": 1.5046898126602173, + "learning_rate": 1.0458567980691876e-07, + "loss": 0.9139, + "step": 1371 + }, + { + "epoch": 1.9855282199710564, + "grad_norm": 1.517932653427124, + "learning_rate": 1.0056315366049881e-07, + "loss": 0.9076, + "step": 1372 + }, + { + "epoch": 1.9869753979739508, + "grad_norm": 1.6246222257614136, + "learning_rate": 9.654062751407884e-08, + "loss": 0.9419, + "step": 1373 + }, + { + "epoch": 1.9884225759768452, + "grad_norm": 1.5156887769699097, + "learning_rate": 9.25181013676589e-08, + "loss": 0.9408, + "step": 1374 + }, + { + "epoch": 1.9898697539797396, + "grad_norm": 1.5260882377624512, + "learning_rate": 8.849557522123894e-08, + "loss": 0.909, + "step": 1375 + }, + { + "epoch": 1.9913169319826338, + "grad_norm": 1.4769291877746582, + "learning_rate": 8.447304907481899e-08, + "loss": 0.8988, + "step": 1376 + }, + { + "epoch": 1.9927641099855282, + "grad_norm": 1.4575843811035156, + "learning_rate": 8.045052292839905e-08, + "loss": 0.9249, + "step": 1377 + }, + { + "epoch": 1.9942112879884226, + "grad_norm": 1.5837633609771729, + "learning_rate": 7.642799678197908e-08, + "loss": 0.9313, + "step": 1378 + }, + { + "epoch": 1.9956584659913168, + "grad_norm": 1.495152235031128, + "learning_rate": 7.240547063555913e-08, + "loss": 0.9183, + "step": 1379 + }, + { + "epoch": 1.9971056439942112, + "grad_norm": 1.4821348190307617, + "learning_rate": 6.838294448913919e-08, + "loss": 0.9064, + "step": 1380 + }, + { + "epoch": 1.9971056439942112, + "eval_loss": 1.032882571220398, + "eval_runtime": 23.8637, + "eval_samples_per_second": 41.905, + "eval_steps_per_second": 2.64, + "step": 1380 + }, + { + "epoch": 1.9985528219971056, + "grad_norm": 1.5770803689956665, + "learning_rate": 6.436041834271923e-08, + "loss": 0.934, + "step": 1381 + }, + { + "epoch": 2.0, + "grad_norm": 1.5071070194244385, + "learning_rate": 6.033789219629928e-08, + "loss": 0.9017, + "step": 1382 + } + ], + "logging_steps": 1, + "max_steps": 1382, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.654550286644019e+17, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}