{ "best_metric": 1.091291069984436, "best_model_checkpoint": "vietnamese-emb-long-mlm/checkpoint-8000", "epoch": 5.0, "eval_steps": 500, "global_step": 8400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05952380952380952, "grad_norm": 7.383795261383057, "learning_rate": 4.940476190476191e-05, "loss": 6.9284, "step": 100 }, { "epoch": 0.11904761904761904, "grad_norm": 11.806465148925781, "learning_rate": 4.880952380952381e-05, "loss": 5.4103, "step": 200 }, { "epoch": 0.17857142857142858, "grad_norm": 10.51978588104248, "learning_rate": 4.8214285714285716e-05, "loss": 4.45, "step": 300 }, { "epoch": 0.23809523809523808, "grad_norm": 10.803540229797363, "learning_rate": 4.761904761904762e-05, "loss": 3.7495, "step": 400 }, { "epoch": 0.2976190476190476, "grad_norm": 10.87264347076416, "learning_rate": 4.7023809523809525e-05, "loss": 3.3683, "step": 500 }, { "epoch": 0.2976190476190476, "eval_loss": 2.989553928375244, "eval_runtime": 105.8686, "eval_samples_per_second": 112.838, "eval_steps_per_second": 1.766, "step": 500 }, { "epoch": 0.35714285714285715, "grad_norm": 9.709155082702637, "learning_rate": 4.642857142857143e-05, "loss": 3.0939, "step": 600 }, { "epoch": 0.4166666666666667, "grad_norm": 9.323984146118164, "learning_rate": 4.5833333333333334e-05, "loss": 2.8534, "step": 700 }, { "epoch": 0.47619047619047616, "grad_norm": 10.167367935180664, "learning_rate": 4.523809523809524e-05, "loss": 2.7091, "step": 800 }, { "epoch": 0.5357142857142857, "grad_norm": 10.095072746276855, "learning_rate": 4.464285714285715e-05, "loss": 2.5711, "step": 900 }, { "epoch": 0.5952380952380952, "grad_norm": 9.4666109085083, "learning_rate": 4.404761904761905e-05, "loss": 2.4663, "step": 1000 }, { "epoch": 0.5952380952380952, "eval_loss": 2.2190322875976562, "eval_runtime": 106.1834, "eval_samples_per_second": 112.504, "eval_steps_per_second": 1.761, "step": 1000 }, { "epoch": 0.6547619047619048, "grad_norm": 9.085387229919434, "learning_rate": 4.345238095238096e-05, "loss": 2.4001, "step": 1100 }, { "epoch": 0.7142857142857143, "grad_norm": 9.016592979431152, "learning_rate": 4.2857142857142856e-05, "loss": 2.3036, "step": 1200 }, { "epoch": 0.7738095238095238, "grad_norm": 10.062590599060059, "learning_rate": 4.226190476190476e-05, "loss": 2.2139, "step": 1300 }, { "epoch": 0.8333333333333334, "grad_norm": 9.198400497436523, "learning_rate": 4.166666666666667e-05, "loss": 2.1477, "step": 1400 }, { "epoch": 0.8928571428571429, "grad_norm": 9.555102348327637, "learning_rate": 4.107142857142857e-05, "loss": 2.078, "step": 1500 }, { "epoch": 0.8928571428571429, "eval_loss": 1.8398191928863525, "eval_runtime": 106.2078, "eval_samples_per_second": 112.478, "eval_steps_per_second": 1.761, "step": 1500 }, { "epoch": 0.9523809523809523, "grad_norm": 9.729957580566406, "learning_rate": 4.047619047619048e-05, "loss": 2.0146, "step": 1600 }, { "epoch": 1.0119047619047619, "grad_norm": 9.68885326385498, "learning_rate": 3.9880952380952386e-05, "loss": 1.9381, "step": 1700 }, { "epoch": 1.0714285714285714, "grad_norm": 8.330371856689453, "learning_rate": 3.928571428571429e-05, "loss": 1.8773, "step": 1800 }, { "epoch": 1.130952380952381, "grad_norm": 9.066886901855469, "learning_rate": 3.8690476190476195e-05, "loss": 1.8749, "step": 1900 }, { "epoch": 1.1904761904761905, "grad_norm": 10.275750160217285, "learning_rate": 3.809523809523809e-05, "loss": 1.8204, "step": 2000 }, { "epoch": 1.1904761904761905, "eval_loss": 1.667816400527954, "eval_runtime": 106.1061, "eval_samples_per_second": 112.585, "eval_steps_per_second": 1.762, "step": 2000 }, { "epoch": 1.25, "grad_norm": 9.68263053894043, "learning_rate": 3.7500000000000003e-05, "loss": 1.8137, "step": 2100 }, { "epoch": 1.3095238095238095, "grad_norm": 10.254537582397461, "learning_rate": 3.690476190476191e-05, "loss": 1.8096, "step": 2200 }, { "epoch": 1.369047619047619, "grad_norm": 8.361231803894043, "learning_rate": 3.630952380952381e-05, "loss": 1.7509, "step": 2300 }, { "epoch": 1.4285714285714286, "grad_norm": 9.450984954833984, "learning_rate": 3.571428571428572e-05, "loss": 1.7277, "step": 2400 }, { "epoch": 1.4880952380952381, "grad_norm": 8.3367280960083, "learning_rate": 3.511904761904762e-05, "loss": 1.7063, "step": 2500 }, { "epoch": 1.4880952380952381, "eval_loss": 1.55353844165802, "eval_runtime": 106.2169, "eval_samples_per_second": 112.468, "eval_steps_per_second": 1.761, "step": 2500 }, { "epoch": 1.5476190476190477, "grad_norm": 8.55213451385498, "learning_rate": 3.4523809523809526e-05, "loss": 1.6952, "step": 2600 }, { "epoch": 1.6071428571428572, "grad_norm": 8.386811256408691, "learning_rate": 3.392857142857143e-05, "loss": 1.6654, "step": 2700 }, { "epoch": 1.6666666666666665, "grad_norm": 9.841432571411133, "learning_rate": 3.3333333333333335e-05, "loss": 1.6662, "step": 2800 }, { "epoch": 1.7261904761904763, "grad_norm": 8.69044303894043, "learning_rate": 3.273809523809524e-05, "loss": 1.6291, "step": 2900 }, { "epoch": 1.7857142857142856, "grad_norm": 9.23985481262207, "learning_rate": 3.2142857142857144e-05, "loss": 1.5943, "step": 3000 }, { "epoch": 1.7857142857142856, "eval_loss": 1.464246392250061, "eval_runtime": 106.2444, "eval_samples_per_second": 112.439, "eval_steps_per_second": 1.76, "step": 3000 }, { "epoch": 1.8452380952380953, "grad_norm": 8.489648818969727, "learning_rate": 3.154761904761905e-05, "loss": 1.6024, "step": 3100 }, { "epoch": 1.9047619047619047, "grad_norm": 8.592900276184082, "learning_rate": 3.095238095238095e-05, "loss": 1.5711, "step": 3200 }, { "epoch": 1.9642857142857144, "grad_norm": 8.73353099822998, "learning_rate": 3.0357142857142857e-05, "loss": 1.561, "step": 3300 }, { "epoch": 2.0238095238095237, "grad_norm": 7.832152843475342, "learning_rate": 2.9761904761904762e-05, "loss": 1.4996, "step": 3400 }, { "epoch": 2.0833333333333335, "grad_norm": 8.37968921661377, "learning_rate": 2.916666666666667e-05, "loss": 1.5235, "step": 3500 }, { "epoch": 2.0833333333333335, "eval_loss": 1.3883955478668213, "eval_runtime": 106.3163, "eval_samples_per_second": 112.363, "eval_steps_per_second": 1.759, "step": 3500 }, { "epoch": 2.142857142857143, "grad_norm": 9.074413299560547, "learning_rate": 2.857142857142857e-05, "loss": 1.4867, "step": 3600 }, { "epoch": 2.2023809523809526, "grad_norm": 8.715752601623535, "learning_rate": 2.797619047619048e-05, "loss": 1.481, "step": 3700 }, { "epoch": 2.261904761904762, "grad_norm": 8.675792694091797, "learning_rate": 2.7380952380952383e-05, "loss": 1.4462, "step": 3800 }, { "epoch": 2.3214285714285716, "grad_norm": 8.826884269714355, "learning_rate": 2.6785714285714288e-05, "loss": 1.449, "step": 3900 }, { "epoch": 2.380952380952381, "grad_norm": 8.927966117858887, "learning_rate": 2.6190476190476192e-05, "loss": 1.4307, "step": 4000 }, { "epoch": 2.380952380952381, "eval_loss": 1.3017064332962036, "eval_runtime": 106.1765, "eval_samples_per_second": 112.511, "eval_steps_per_second": 1.761, "step": 4000 }, { "epoch": 2.4404761904761907, "grad_norm": 7.1491804122924805, "learning_rate": 2.5595238095238093e-05, "loss": 1.4789, "step": 4100 }, { "epoch": 2.5, "grad_norm": 7.50277853012085, "learning_rate": 2.5e-05, "loss": 1.4296, "step": 4200 }, { "epoch": 2.5595238095238093, "grad_norm": 8.132680892944336, "learning_rate": 2.4404761904761906e-05, "loss": 1.4177, "step": 4300 }, { "epoch": 2.619047619047619, "grad_norm": 8.233142852783203, "learning_rate": 2.380952380952381e-05, "loss": 1.3953, "step": 4400 }, { "epoch": 2.678571428571429, "grad_norm": 7.6400628089904785, "learning_rate": 2.3214285714285715e-05, "loss": 1.4063, "step": 4500 }, { "epoch": 2.678571428571429, "eval_loss": 1.2739932537078857, "eval_runtime": 106.1766, "eval_samples_per_second": 112.511, "eval_steps_per_second": 1.761, "step": 4500 }, { "epoch": 2.738095238095238, "grad_norm": 7.416648864746094, "learning_rate": 2.261904761904762e-05, "loss": 1.4008, "step": 4600 }, { "epoch": 2.7976190476190474, "grad_norm": 7.283276081085205, "learning_rate": 2.2023809523809524e-05, "loss": 1.3818, "step": 4700 }, { "epoch": 2.857142857142857, "grad_norm": 8.360782623291016, "learning_rate": 2.1428571428571428e-05, "loss": 1.379, "step": 4800 }, { "epoch": 2.9166666666666665, "grad_norm": 9.960053443908691, "learning_rate": 2.0833333333333336e-05, "loss": 1.3479, "step": 4900 }, { "epoch": 2.9761904761904763, "grad_norm": 8.903584480285645, "learning_rate": 2.023809523809524e-05, "loss": 1.3389, "step": 5000 }, { "epoch": 2.9761904761904763, "eval_loss": 1.2187227010726929, "eval_runtime": 106.0745, "eval_samples_per_second": 112.619, "eval_steps_per_second": 1.763, "step": 5000 }, { "epoch": 3.0357142857142856, "grad_norm": 8.299568176269531, "learning_rate": 1.9642857142857145e-05, "loss": 1.3336, "step": 5100 }, { "epoch": 3.0952380952380953, "grad_norm": 8.194002151489258, "learning_rate": 1.9047619047619046e-05, "loss": 1.3102, "step": 5200 }, { "epoch": 3.1547619047619047, "grad_norm": 7.804888725280762, "learning_rate": 1.8452380952380954e-05, "loss": 1.3354, "step": 5300 }, { "epoch": 3.2142857142857144, "grad_norm": 7.510008811950684, "learning_rate": 1.785714285714286e-05, "loss": 1.3382, "step": 5400 }, { "epoch": 3.2738095238095237, "grad_norm": 7.9406023025512695, "learning_rate": 1.7261904761904763e-05, "loss": 1.2996, "step": 5500 }, { "epoch": 3.2738095238095237, "eval_loss": 1.2163845300674438, "eval_runtime": 106.2374, "eval_samples_per_second": 112.446, "eval_steps_per_second": 1.76, "step": 5500 }, { "epoch": 3.3333333333333335, "grad_norm": 7.935986042022705, "learning_rate": 1.6666666666666667e-05, "loss": 1.2889, "step": 5600 }, { "epoch": 3.392857142857143, "grad_norm": 7.514946937561035, "learning_rate": 1.6071428571428572e-05, "loss": 1.3353, "step": 5700 }, { "epoch": 3.4523809523809526, "grad_norm": 8.081690788269043, "learning_rate": 1.5476190476190476e-05, "loss": 1.2959, "step": 5800 }, { "epoch": 3.511904761904762, "grad_norm": 8.02477741241455, "learning_rate": 1.4880952380952381e-05, "loss": 1.2467, "step": 5900 }, { "epoch": 3.571428571428571, "grad_norm": 8.462018013000488, "learning_rate": 1.4285714285714285e-05, "loss": 1.2899, "step": 6000 }, { "epoch": 3.571428571428571, "eval_loss": 1.1731750965118408, "eval_runtime": 106.1002, "eval_samples_per_second": 112.592, "eval_steps_per_second": 1.762, "step": 6000 }, { "epoch": 3.630952380952381, "grad_norm": 7.650486469268799, "learning_rate": 1.3690476190476192e-05, "loss": 1.2878, "step": 6100 }, { "epoch": 3.6904761904761907, "grad_norm": 8.889339447021484, "learning_rate": 1.3095238095238096e-05, "loss": 1.2704, "step": 6200 }, { "epoch": 3.75, "grad_norm": 9.502975463867188, "learning_rate": 1.25e-05, "loss": 1.2406, "step": 6300 }, { "epoch": 3.8095238095238093, "grad_norm": 9.480072975158691, "learning_rate": 1.1904761904761905e-05, "loss": 1.2707, "step": 6400 }, { "epoch": 3.869047619047619, "grad_norm": 7.378728866577148, "learning_rate": 1.130952380952381e-05, "loss": 1.2343, "step": 6500 }, { "epoch": 3.869047619047619, "eval_loss": 1.1631048917770386, "eval_runtime": 106.023, "eval_samples_per_second": 112.674, "eval_steps_per_second": 1.764, "step": 6500 }, { "epoch": 3.928571428571429, "grad_norm": 8.64068603515625, "learning_rate": 1.0714285714285714e-05, "loss": 1.2478, "step": 6600 }, { "epoch": 3.988095238095238, "grad_norm": 8.00733470916748, "learning_rate": 1.011904761904762e-05, "loss": 1.2414, "step": 6700 }, { "epoch": 4.0476190476190474, "grad_norm": 7.568075180053711, "learning_rate": 9.523809523809523e-06, "loss": 1.2533, "step": 6800 }, { "epoch": 4.107142857142857, "grad_norm": 7.190471649169922, "learning_rate": 8.92857142857143e-06, "loss": 1.2495, "step": 6900 }, { "epoch": 4.166666666666667, "grad_norm": 8.588766098022461, "learning_rate": 8.333333333333334e-06, "loss": 1.2095, "step": 7000 }, { "epoch": 4.166666666666667, "eval_loss": 1.0942662954330444, "eval_runtime": 106.0638, "eval_samples_per_second": 112.63, "eval_steps_per_second": 1.763, "step": 7000 }, { "epoch": 4.226190476190476, "grad_norm": 8.47437858581543, "learning_rate": 7.738095238095238e-06, "loss": 1.2134, "step": 7100 }, { "epoch": 4.285714285714286, "grad_norm": 7.481777191162109, "learning_rate": 7.142857142857143e-06, "loss": 1.1873, "step": 7200 }, { "epoch": 4.345238095238095, "grad_norm": 7.533321380615234, "learning_rate": 6.547619047619048e-06, "loss": 1.2022, "step": 7300 }, { "epoch": 4.404761904761905, "grad_norm": 8.106829643249512, "learning_rate": 5.9523809523809525e-06, "loss": 1.2162, "step": 7400 }, { "epoch": 4.464285714285714, "grad_norm": 7.893404483795166, "learning_rate": 5.357142857142857e-06, "loss": 1.2078, "step": 7500 }, { "epoch": 4.464285714285714, "eval_loss": 1.1191558837890625, "eval_runtime": 106.3099, "eval_samples_per_second": 112.37, "eval_steps_per_second": 1.759, "step": 7500 }, { "epoch": 4.523809523809524, "grad_norm": 8.90477466583252, "learning_rate": 4.7619047619047615e-06, "loss": 1.219, "step": 7600 }, { "epoch": 4.583333333333333, "grad_norm": 7.603702545166016, "learning_rate": 4.166666666666667e-06, "loss": 1.2, "step": 7700 }, { "epoch": 4.642857142857143, "grad_norm": 8.275321006774902, "learning_rate": 3.5714285714285714e-06, "loss": 1.1985, "step": 7800 }, { "epoch": 4.7023809523809526, "grad_norm": 8.652661323547363, "learning_rate": 2.9761904761904763e-06, "loss": 1.2421, "step": 7900 }, { "epoch": 4.761904761904762, "grad_norm": 8.569392204284668, "learning_rate": 2.3809523809523808e-06, "loss": 1.208, "step": 8000 }, { "epoch": 4.761904761904762, "eval_loss": 1.091291069984436, "eval_runtime": 106.5156, "eval_samples_per_second": 112.153, "eval_steps_per_second": 1.756, "step": 8000 }, { "epoch": 4.821428571428571, "grad_norm": 8.579488754272461, "learning_rate": 1.7857142857142857e-06, "loss": 1.2274, "step": 8100 }, { "epoch": 4.880952380952381, "grad_norm": 9.347880363464355, "learning_rate": 1.1904761904761904e-06, "loss": 1.1833, "step": 8200 }, { "epoch": 4.940476190476191, "grad_norm": 7.7748894691467285, "learning_rate": 5.952380952380952e-07, "loss": 1.1919, "step": 8300 }, { "epoch": 5.0, "grad_norm": 8.465712547302246, "learning_rate": 0.0, "loss": 1.2069, "step": 8400 } ], "logging_steps": 100, "max_steps": 8400, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5633072459126792e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }