|
{ |
|
"best_metric": 1.091291069984436, |
|
"best_model_checkpoint": "vietnamese-emb-long-mlm/checkpoint-8000", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 8400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 7.383795261383057, |
|
"learning_rate": 4.940476190476191e-05, |
|
"loss": 6.9284, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 11.806465148925781, |
|
"learning_rate": 4.880952380952381e-05, |
|
"loss": 5.4103, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 10.51978588104248, |
|
"learning_rate": 4.8214285714285716e-05, |
|
"loss": 4.45, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 10.803540229797363, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 3.7495, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 10.87264347076416, |
|
"learning_rate": 4.7023809523809525e-05, |
|
"loss": 3.3683, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"eval_loss": 2.989553928375244, |
|
"eval_runtime": 105.8686, |
|
"eval_samples_per_second": 112.838, |
|
"eval_steps_per_second": 1.766, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 9.709155082702637, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 3.0939, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 9.323984146118164, |
|
"learning_rate": 4.5833333333333334e-05, |
|
"loss": 2.8534, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 10.167367935180664, |
|
"learning_rate": 4.523809523809524e-05, |
|
"loss": 2.7091, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 10.095072746276855, |
|
"learning_rate": 4.464285714285715e-05, |
|
"loss": 2.5711, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 9.4666109085083, |
|
"learning_rate": 4.404761904761905e-05, |
|
"loss": 2.4663, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"eval_loss": 2.2190322875976562, |
|
"eval_runtime": 106.1834, |
|
"eval_samples_per_second": 112.504, |
|
"eval_steps_per_second": 1.761, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6547619047619048, |
|
"grad_norm": 9.085387229919434, |
|
"learning_rate": 4.345238095238096e-05, |
|
"loss": 2.4001, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 9.016592979431152, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 2.3036, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7738095238095238, |
|
"grad_norm": 10.062590599060059, |
|
"learning_rate": 4.226190476190476e-05, |
|
"loss": 2.2139, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 9.198400497436523, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 2.1477, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 9.555102348327637, |
|
"learning_rate": 4.107142857142857e-05, |
|
"loss": 2.078, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"eval_loss": 1.8398191928863525, |
|
"eval_runtime": 106.2078, |
|
"eval_samples_per_second": 112.478, |
|
"eval_steps_per_second": 1.761, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 9.729957580566406, |
|
"learning_rate": 4.047619047619048e-05, |
|
"loss": 2.0146, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0119047619047619, |
|
"grad_norm": 9.68885326385498, |
|
"learning_rate": 3.9880952380952386e-05, |
|
"loss": 1.9381, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 8.330371856689453, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 1.8773, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.130952380952381, |
|
"grad_norm": 9.066886901855469, |
|
"learning_rate": 3.8690476190476195e-05, |
|
"loss": 1.8749, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 10.275750160217285, |
|
"learning_rate": 3.809523809523809e-05, |
|
"loss": 1.8204, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"eval_loss": 1.667816400527954, |
|
"eval_runtime": 106.1061, |
|
"eval_samples_per_second": 112.585, |
|
"eval_steps_per_second": 1.762, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 9.68263053894043, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.8137, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.3095238095238095, |
|
"grad_norm": 10.254537582397461, |
|
"learning_rate": 3.690476190476191e-05, |
|
"loss": 1.8096, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.369047619047619, |
|
"grad_norm": 8.361231803894043, |
|
"learning_rate": 3.630952380952381e-05, |
|
"loss": 1.7509, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 9.450984954833984, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 1.7277, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.4880952380952381, |
|
"grad_norm": 8.3367280960083, |
|
"learning_rate": 3.511904761904762e-05, |
|
"loss": 1.7063, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.4880952380952381, |
|
"eval_loss": 1.55353844165802, |
|
"eval_runtime": 106.2169, |
|
"eval_samples_per_second": 112.468, |
|
"eval_steps_per_second": 1.761, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.5476190476190477, |
|
"grad_norm": 8.55213451385498, |
|
"learning_rate": 3.4523809523809526e-05, |
|
"loss": 1.6952, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 8.386811256408691, |
|
"learning_rate": 3.392857142857143e-05, |
|
"loss": 1.6654, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 9.841432571411133, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 1.6662, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.7261904761904763, |
|
"grad_norm": 8.69044303894043, |
|
"learning_rate": 3.273809523809524e-05, |
|
"loss": 1.6291, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 9.23985481262207, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 1.5943, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"eval_loss": 1.464246392250061, |
|
"eval_runtime": 106.2444, |
|
"eval_samples_per_second": 112.439, |
|
"eval_steps_per_second": 1.76, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.8452380952380953, |
|
"grad_norm": 8.489648818969727, |
|
"learning_rate": 3.154761904761905e-05, |
|
"loss": 1.6024, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 8.592900276184082, |
|
"learning_rate": 3.095238095238095e-05, |
|
"loss": 1.5711, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 8.73353099822998, |
|
"learning_rate": 3.0357142857142857e-05, |
|
"loss": 1.561, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.0238095238095237, |
|
"grad_norm": 7.832152843475342, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 1.4996, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 8.37968921661377, |
|
"learning_rate": 2.916666666666667e-05, |
|
"loss": 1.5235, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"eval_loss": 1.3883955478668213, |
|
"eval_runtime": 106.3163, |
|
"eval_samples_per_second": 112.363, |
|
"eval_steps_per_second": 1.759, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 9.074413299560547, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 1.4867, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.2023809523809526, |
|
"grad_norm": 8.715752601623535, |
|
"learning_rate": 2.797619047619048e-05, |
|
"loss": 1.481, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.261904761904762, |
|
"grad_norm": 8.675792694091797, |
|
"learning_rate": 2.7380952380952383e-05, |
|
"loss": 1.4462, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 8.826884269714355, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 1.449, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 8.927966117858887, |
|
"learning_rate": 2.6190476190476192e-05, |
|
"loss": 1.4307, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"eval_loss": 1.3017064332962036, |
|
"eval_runtime": 106.1765, |
|
"eval_samples_per_second": 112.511, |
|
"eval_steps_per_second": 1.761, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.4404761904761907, |
|
"grad_norm": 7.1491804122924805, |
|
"learning_rate": 2.5595238095238093e-05, |
|
"loss": 1.4789, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 7.50277853012085, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.4296, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 2.5595238095238093, |
|
"grad_norm": 8.132680892944336, |
|
"learning_rate": 2.4404761904761906e-05, |
|
"loss": 1.4177, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 8.233142852783203, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 1.3953, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 7.6400628089904785, |
|
"learning_rate": 2.3214285714285715e-05, |
|
"loss": 1.4063, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"eval_loss": 1.2739932537078857, |
|
"eval_runtime": 106.1766, |
|
"eval_samples_per_second": 112.511, |
|
"eval_steps_per_second": 1.761, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.738095238095238, |
|
"grad_norm": 7.416648864746094, |
|
"learning_rate": 2.261904761904762e-05, |
|
"loss": 1.4008, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.7976190476190474, |
|
"grad_norm": 7.283276081085205, |
|
"learning_rate": 2.2023809523809524e-05, |
|
"loss": 1.3818, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 8.360782623291016, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 1.379, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 9.960053443908691, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 1.3479, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 8.903584480285645, |
|
"learning_rate": 2.023809523809524e-05, |
|
"loss": 1.3389, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"eval_loss": 1.2187227010726929, |
|
"eval_runtime": 106.0745, |
|
"eval_samples_per_second": 112.619, |
|
"eval_steps_per_second": 1.763, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.0357142857142856, |
|
"grad_norm": 8.299568176269531, |
|
"learning_rate": 1.9642857142857145e-05, |
|
"loss": 1.3336, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.0952380952380953, |
|
"grad_norm": 8.194002151489258, |
|
"learning_rate": 1.9047619047619046e-05, |
|
"loss": 1.3102, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.1547619047619047, |
|
"grad_norm": 7.804888725280762, |
|
"learning_rate": 1.8452380952380954e-05, |
|
"loss": 1.3354, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.2142857142857144, |
|
"grad_norm": 7.510008811950684, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 1.3382, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.2738095238095237, |
|
"grad_norm": 7.9406023025512695, |
|
"learning_rate": 1.7261904761904763e-05, |
|
"loss": 1.2996, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.2738095238095237, |
|
"eval_loss": 1.2163845300674438, |
|
"eval_runtime": 106.2374, |
|
"eval_samples_per_second": 112.446, |
|
"eval_steps_per_second": 1.76, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 7.935986042022705, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.2889, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 3.392857142857143, |
|
"grad_norm": 7.514946937561035, |
|
"learning_rate": 1.6071428571428572e-05, |
|
"loss": 1.3353, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 3.4523809523809526, |
|
"grad_norm": 8.081690788269043, |
|
"learning_rate": 1.5476190476190476e-05, |
|
"loss": 1.2959, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 3.511904761904762, |
|
"grad_norm": 8.02477741241455, |
|
"learning_rate": 1.4880952380952381e-05, |
|
"loss": 1.2467, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 8.462018013000488, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 1.2899, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"eval_loss": 1.1731750965118408, |
|
"eval_runtime": 106.1002, |
|
"eval_samples_per_second": 112.592, |
|
"eval_steps_per_second": 1.762, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 3.630952380952381, |
|
"grad_norm": 7.650486469268799, |
|
"learning_rate": 1.3690476190476192e-05, |
|
"loss": 1.2878, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 3.6904761904761907, |
|
"grad_norm": 8.889339447021484, |
|
"learning_rate": 1.3095238095238096e-05, |
|
"loss": 1.2704, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 9.502975463867188, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.2406, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 9.480072975158691, |
|
"learning_rate": 1.1904761904761905e-05, |
|
"loss": 1.2707, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 3.869047619047619, |
|
"grad_norm": 7.378728866577148, |
|
"learning_rate": 1.130952380952381e-05, |
|
"loss": 1.2343, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.869047619047619, |
|
"eval_loss": 1.1631048917770386, |
|
"eval_runtime": 106.023, |
|
"eval_samples_per_second": 112.674, |
|
"eval_steps_per_second": 1.764, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 3.928571428571429, |
|
"grad_norm": 8.64068603515625, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 1.2478, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.988095238095238, |
|
"grad_norm": 8.00733470916748, |
|
"learning_rate": 1.011904761904762e-05, |
|
"loss": 1.2414, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.0476190476190474, |
|
"grad_norm": 7.568075180053711, |
|
"learning_rate": 9.523809523809523e-06, |
|
"loss": 1.2533, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.107142857142857, |
|
"grad_norm": 7.190471649169922, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 1.2495, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 8.588766098022461, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 1.2095, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"eval_loss": 1.0942662954330444, |
|
"eval_runtime": 106.0638, |
|
"eval_samples_per_second": 112.63, |
|
"eval_steps_per_second": 1.763, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 4.226190476190476, |
|
"grad_norm": 8.47437858581543, |
|
"learning_rate": 7.738095238095238e-06, |
|
"loss": 1.2134, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 7.481777191162109, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 1.1873, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 4.345238095238095, |
|
"grad_norm": 7.533321380615234, |
|
"learning_rate": 6.547619047619048e-06, |
|
"loss": 1.2022, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 4.404761904761905, |
|
"grad_norm": 8.106829643249512, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 1.2162, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 4.464285714285714, |
|
"grad_norm": 7.893404483795166, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 1.2078, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.464285714285714, |
|
"eval_loss": 1.1191558837890625, |
|
"eval_runtime": 106.3099, |
|
"eval_samples_per_second": 112.37, |
|
"eval_steps_per_second": 1.759, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 4.523809523809524, |
|
"grad_norm": 8.90477466583252, |
|
"learning_rate": 4.7619047619047615e-06, |
|
"loss": 1.219, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 4.583333333333333, |
|
"grad_norm": 7.603702545166016, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 1.2, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 4.642857142857143, |
|
"grad_norm": 8.275321006774902, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 1.1985, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 4.7023809523809526, |
|
"grad_norm": 8.652661323547363, |
|
"learning_rate": 2.9761904761904763e-06, |
|
"loss": 1.2421, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"grad_norm": 8.569392204284668, |
|
"learning_rate": 2.3809523809523808e-06, |
|
"loss": 1.208, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"eval_loss": 1.091291069984436, |
|
"eval_runtime": 106.5156, |
|
"eval_samples_per_second": 112.153, |
|
"eval_steps_per_second": 1.756, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 4.821428571428571, |
|
"grad_norm": 8.579488754272461, |
|
"learning_rate": 1.7857142857142857e-06, |
|
"loss": 1.2274, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 4.880952380952381, |
|
"grad_norm": 9.347880363464355, |
|
"learning_rate": 1.1904761904761904e-06, |
|
"loss": 1.1833, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 4.940476190476191, |
|
"grad_norm": 7.7748894691467285, |
|
"learning_rate": 5.952380952380952e-07, |
|
"loss": 1.1919, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 8.465712547302246, |
|
"learning_rate": 0.0, |
|
"loss": 1.2069, |
|
"step": 8400 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 8400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5633072459126792e+16, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|