diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..30744be852d54f33849e109fc945c3bd88091ce9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-129072/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-215120/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-258144/tokenizer.json filter=lfs diff=lfs merge=lfs -text +generated_predictions.txt filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9dcc43b15adbdca7c7adc8fc4ea5271fdd2abc2c --- /dev/null +++ b/README.md @@ -0,0 +1,75 @@ +--- +language: +- ja +- ko +base_model: facebook/mbart-large-50-many-to-many-mmt +tags: +- generated_from_trainer +metrics: +- bleu +model-index: +- name: enko_mbartLarge_100p_sup2 + results: [] +--- + + + +# enko_mbartLarge_100p_sup2 + +This model is a fine-tuned version of [facebook/mbart-large-50-many-to-many-mmt](https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt) on an unknown dataset. +It achieves the following results on the evaluation set: +- Loss: 0.6417 +- Bleu: 59.1835 +- Gen Len: 15.7226 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 2 +- total_train_batch_size: 16 +- total_eval_batch_size: 8 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: linear +- lr_scheduler_warmup_steps: 2500 +- num_epochs: 15 +- mixed_precision_training: Native AMP + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Bleu | Gen Len | +|:-------------:|:-----:|:------:|:---------------:|:-------:|:-------:| +| 0.7676 | 1.0 | 43024 | 0.7125 | 55.2526 | 16.382 | +| 0.6349 | 2.0 | 86048 | 0.6547 | 58.202 | 15.9466 | +| 0.537 | 3.0 | 129072 | 0.6417 | 59.1835 | 15.7226 | +| 0.434 | 4.0 | 172096 | 0.6589 | 59.6194 | 15.702 | +| 0.3504 | 5.0 | 215120 | 0.7117 | 59.352 | 15.7454 | +| 0.2799 | 6.0 | 258144 | 0.7784 | 59.2034 | 15.6702 | + + +### Framework versions + +- Transformers 4.37.2 +- Pytorch 2.2.0+cu121 +- Datasets 2.16.1 +- Tokenizers 0.15.1 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..091e21432e340e0489a4e986efa7eda05494f9f7 --- /dev/null +++ b/all_results.json @@ -0,0 +1,22 @@ +{ + "epoch": 6.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6644.884, + "eval_samples": 85914, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616, + "predict_bleu": 59.4091, + "predict_gen_len": 16.1979, + "predict_loss": 0.6332442164421082, + "predict_runtime": 7442.4048, + "predict_samples": 94835, + "predict_samples_per_second": 12.743, + "predict_steps_per_second": 1.593, + "train_loss": 0.5174947596547979, + "train_runtime": 240022.8696, + "train_samples": 688378, + "train_samples_per_second": 43.02, + "train_steps_per_second": 2.689 +} \ No newline at end of file diff --git a/checkpoint-129072/config.json b/checkpoint-129072/config.json new file mode 100644 index 0000000000000000000000000000000000000000..77b8612fc6bf9e6aaa7560c8bcce3e4c9c6986de --- /dev/null +++ b/checkpoint-129072/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.37.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-129072/generation_config.json b/checkpoint-129072/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d0b25bc2cc841a32d579c94d80eb6323756b7a --- /dev/null +++ b/checkpoint-129072/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.37.2" +} diff --git a/checkpoint-129072/model.safetensors b/checkpoint-129072/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4de350a5baef79d3b3285908a7478968aa943d99 --- /dev/null +++ b/checkpoint-129072/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8ee05becd6c68dae4da8f2188129cca72d4a29e3fc1ff98ff891ac907f3f7b +size 2444578688 diff --git a/checkpoint-129072/optimizer.pt b/checkpoint-129072/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a047bd11e53e5579e27e6f16085dd3245add243a --- /dev/null +++ b/checkpoint-129072/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82d551da8a0f37a16ada04748f9a92def4edb437be29c6accff61a261e8e77c +size 4887473903 diff --git a/checkpoint-129072/rng_state_0.pth b/checkpoint-129072/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c1675482c4623a23dd47e0f03edfc2ba3210e48 --- /dev/null +++ b/checkpoint-129072/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a89ccfb8db24468223ae126eebfa6d5e097c8dd13680317b080f746427291bc +size 15024 diff --git a/checkpoint-129072/rng_state_1.pth b/checkpoint-129072/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a217b21f44c5048e4f72ab97a3a1ecc981b65e86 --- /dev/null +++ b/checkpoint-129072/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db427607245c0534cd6c7dd82183a6a1263d161f095c2911cfff67caf3bb4bf3 +size 15024 diff --git a/checkpoint-129072/rng_state_2.pth b/checkpoint-129072/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b93a435c192f56331872c84b3194b60b0ff36648 --- /dev/null +++ b/checkpoint-129072/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcc51f2f63ba2ead4c24b97881b50e119b52b59d4b35cfbc2ec3507a214971fe +size 15024 diff --git a/checkpoint-129072/rng_state_3.pth b/checkpoint-129072/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..43bfea95c4d77b73af53cbe99364e2bd09611f66 --- /dev/null +++ b/checkpoint-129072/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c33e60aae1caec053054513947718af09a67619013c3ccc20789c4e2191de7e +size 15024 diff --git a/checkpoint-129072/scheduler.pt b/checkpoint-129072/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0db7c1614cb55d592fe350e5f875d7668daaa153 --- /dev/null +++ b/checkpoint-129072/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06ad2796312a4d055bf71a1a39de952d4904a95514a42f7d333435157391b3c2 +size 1064 diff --git a/checkpoint-129072/sentencepiece.bpe.model b/checkpoint-129072/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-129072/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-129072/special_tokens_map.json b/checkpoint-129072/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-129072/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-129072/tokenizer.json b/checkpoint-129072/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc6a4f3075bc2a01607c72e81fd24456ab68311 --- /dev/null +++ b/checkpoint-129072/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb9b1f3e7ce9f6c1a5ab4560578eda3329db396be400909c5d34c8d0b08b0ed +size 17110208 diff --git a/checkpoint-129072/tokenizer_config.json b/checkpoint-129072/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70c0515a5815fcc727e11e053116348bfac12128 --- /dev/null +++ b/checkpoint-129072/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "ja_XX", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-129072/trainer_state.json b/checkpoint-129072/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6bdd830b77bd5780a246945fb94bebfed1993108 --- /dev/null +++ b/checkpoint-129072/trainer_state.json @@ -0,0 +1,1599 @@ +{ + "best_metric": 0.6417234539985657, + "best_model_checkpoint": "./enko_mbartLarge_100p_sup2/checkpoint-129072", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 129072, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.940000000000001e-06, + "loss": 2.2901, + "step": 500 + }, + { + "epoch": 0.02, + "learning_rate": 1.992e-05, + "loss": 1.6084, + "step": 1000 + }, + { + "epoch": 0.03, + "learning_rate": 2.9920000000000005e-05, + "loss": 1.449, + "step": 1500 + }, + { + "epoch": 0.05, + "learning_rate": 3.9920000000000004e-05, + "loss": 1.3874, + "step": 2000 + }, + { + "epoch": 0.06, + "learning_rate": 4.992e-05, + "loss": 1.3485, + "step": 2500 + }, + { + "epoch": 0.07, + "learning_rate": 4.996142239367825e-05, + "loss": 1.2926, + "step": 3000 + }, + { + "epoch": 0.08, + "learning_rate": 4.9922533677628105e-05, + "loss": 1.248, + "step": 3500 + }, + { + "epoch": 0.09, + "learning_rate": 4.988364496157795e-05, + "loss": 1.1931, + "step": 4000 + }, + { + "epoch": 0.1, + "learning_rate": 4.98447562455278e-05, + "loss": 1.1677, + "step": 4500 + }, + { + "epoch": 0.12, + "learning_rate": 4.980594530690975e-05, + "loss": 1.142, + "step": 5000 + }, + { + "epoch": 0.13, + "learning_rate": 4.97670565908596e-05, + "loss": 1.122, + "step": 5500 + }, + { + "epoch": 0.14, + "learning_rate": 4.972816787480945e-05, + "loss": 1.0855, + "step": 6000 + }, + { + "epoch": 0.15, + "learning_rate": 4.968927915875929e-05, + "loss": 1.0719, + "step": 6500 + }, + { + "epoch": 0.16, + "learning_rate": 4.965039044270914e-05, + "loss": 1.053, + "step": 7000 + }, + { + "epoch": 0.17, + "learning_rate": 4.9611501726658995e-05, + "loss": 1.0359, + "step": 7500 + }, + { + "epoch": 0.19, + "learning_rate": 4.957261301060884e-05, + "loss": 1.0232, + "step": 8000 + }, + { + "epoch": 0.2, + "learning_rate": 4.953380207199079e-05, + "loss": 1.0169, + "step": 8500 + }, + { + "epoch": 0.21, + "learning_rate": 4.949491335594064e-05, + "loss": 1.0141, + "step": 9000 + }, + { + "epoch": 0.22, + "learning_rate": 4.945602463989049e-05, + "loss": 0.9855, + "step": 9500 + }, + { + "epoch": 0.23, + "learning_rate": 4.941721370127244e-05, + "loss": 0.9932, + "step": 10000 + }, + { + "epoch": 0.24, + "learning_rate": 4.9378324985222293e-05, + "loss": 0.9656, + "step": 10500 + }, + { + "epoch": 0.26, + "learning_rate": 4.933943626917214e-05, + "loss": 0.9694, + "step": 11000 + }, + { + "epoch": 0.27, + "learning_rate": 4.930054755312199e-05, + "loss": 0.9609, + "step": 11500 + }, + { + "epoch": 0.28, + "learning_rate": 4.926165883707184e-05, + "loss": 0.9526, + "step": 12000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9222770121021686e-05, + "loss": 0.9359, + "step": 12500 + }, + { + "epoch": 0.3, + "learning_rate": 4.9183959182403636e-05, + "loss": 0.9324, + "step": 13000 + }, + { + "epoch": 0.31, + "learning_rate": 4.914507046635349e-05, + "loss": 0.9251, + "step": 13500 + }, + { + "epoch": 0.33, + "learning_rate": 4.910625952773543e-05, + "loss": 0.9216, + "step": 14000 + }, + { + "epoch": 0.34, + "learning_rate": 4.906737081168528e-05, + "loss": 0.9181, + "step": 14500 + }, + { + "epoch": 0.35, + "learning_rate": 4.902855987306723e-05, + "loss": 0.9098, + "step": 15000 + }, + { + "epoch": 0.36, + "learning_rate": 4.898967115701708e-05, + "loss": 0.9067, + "step": 15500 + }, + { + "epoch": 0.37, + "learning_rate": 4.895078244096693e-05, + "loss": 0.8931, + "step": 16000 + }, + { + "epoch": 0.38, + "learning_rate": 4.891189372491678e-05, + "loss": 0.8923, + "step": 16500 + }, + { + "epoch": 0.4, + "learning_rate": 4.887300500886663e-05, + "loss": 0.8703, + "step": 17000 + }, + { + "epoch": 0.41, + "learning_rate": 4.8834116292816475e-05, + "loss": 0.8861, + "step": 17500 + }, + { + "epoch": 0.42, + "learning_rate": 4.879522757676633e-05, + "loss": 0.8864, + "step": 18000 + }, + { + "epoch": 0.43, + "learning_rate": 4.875633886071618e-05, + "loss": 0.886, + "step": 18500 + }, + { + "epoch": 0.44, + "learning_rate": 4.871752792209813e-05, + "loss": 0.8737, + "step": 19000 + }, + { + "epoch": 0.45, + "learning_rate": 4.867863920604798e-05, + "loss": 0.8708, + "step": 19500 + }, + { + "epoch": 0.46, + "learning_rate": 4.8639750489997824e-05, + "loss": 0.865, + "step": 20000 + }, + { + "epoch": 0.48, + "learning_rate": 4.8600861773947676e-05, + "loss": 0.8571, + "step": 20500 + }, + { + "epoch": 0.49, + "learning_rate": 4.856197305789753e-05, + "loss": 0.8621, + "step": 21000 + }, + { + "epoch": 0.5, + "learning_rate": 4.852308434184737e-05, + "loss": 0.8607, + "step": 21500 + }, + { + "epoch": 0.51, + "learning_rate": 4.848419562579722e-05, + "loss": 0.8519, + "step": 22000 + }, + { + "epoch": 0.52, + "learning_rate": 4.844530690974707e-05, + "loss": 0.8472, + "step": 22500 + }, + { + "epoch": 0.53, + "learning_rate": 4.840641819369692e-05, + "loss": 0.8381, + "step": 23000 + }, + { + "epoch": 0.55, + "learning_rate": 4.8367529477646765e-05, + "loss": 0.8329, + "step": 23500 + }, + { + "epoch": 0.56, + "learning_rate": 4.8328640761596616e-05, + "loss": 0.8425, + "step": 24000 + }, + { + "epoch": 0.57, + "learning_rate": 4.828975204554647e-05, + "loss": 0.8227, + "step": 24500 + }, + { + "epoch": 0.58, + "learning_rate": 4.825086332949631e-05, + "loss": 0.8349, + "step": 25000 + }, + { + "epoch": 0.59, + "learning_rate": 4.8211974613446164e-05, + "loss": 0.8235, + "step": 25500 + }, + { + "epoch": 0.6, + "learning_rate": 4.8173085897396015e-05, + "loss": 0.8116, + "step": 26000 + }, + { + "epoch": 0.62, + "learning_rate": 4.813419718134587e-05, + "loss": 0.8367, + "step": 26500 + }, + { + "epoch": 0.63, + "learning_rate": 4.809530846529571e-05, + "loss": 0.8219, + "step": 27000 + }, + { + "epoch": 0.64, + "learning_rate": 4.805641974924556e-05, + "loss": 0.8206, + "step": 27500 + }, + { + "epoch": 0.65, + "learning_rate": 4.8017531033195415e-05, + "loss": 0.8199, + "step": 28000 + }, + { + "epoch": 0.66, + "learning_rate": 4.797864231714526e-05, + "loss": 0.816, + "step": 28500 + }, + { + "epoch": 0.67, + "learning_rate": 4.793983137852721e-05, + "loss": 0.8144, + "step": 29000 + }, + { + "epoch": 0.69, + "learning_rate": 4.790102043990916e-05, + "loss": 0.806, + "step": 29500 + }, + { + "epoch": 0.7, + "learning_rate": 4.786220950129111e-05, + "loss": 0.8134, + "step": 30000 + }, + { + "epoch": 0.71, + "learning_rate": 4.782332078524095e-05, + "loss": 0.8109, + "step": 30500 + }, + { + "epoch": 0.72, + "learning_rate": 4.7784432069190805e-05, + "loss": 0.792, + "step": 31000 + }, + { + "epoch": 0.73, + "learning_rate": 4.7745543353140656e-05, + "loss": 0.8046, + "step": 31500 + }, + { + "epoch": 0.74, + "learning_rate": 4.77066546370905e-05, + "loss": 0.7983, + "step": 32000 + }, + { + "epoch": 0.76, + "learning_rate": 4.766776592104035e-05, + "loss": 0.7937, + "step": 32500 + }, + { + "epoch": 0.77, + "learning_rate": 4.7628877204990204e-05, + "loss": 0.7978, + "step": 33000 + }, + { + "epoch": 0.78, + "learning_rate": 4.758998848894005e-05, + "loss": 0.7961, + "step": 33500 + }, + { + "epoch": 0.79, + "learning_rate": 4.75510997728899e-05, + "loss": 0.7854, + "step": 34000 + }, + { + "epoch": 0.8, + "learning_rate": 4.751221105683975e-05, + "loss": 0.8006, + "step": 34500 + }, + { + "epoch": 0.81, + "learning_rate": 4.7473322340789597e-05, + "loss": 0.7857, + "step": 35000 + }, + { + "epoch": 0.83, + "learning_rate": 4.743443362473945e-05, + "loss": 0.7857, + "step": 35500 + }, + { + "epoch": 0.84, + "learning_rate": 4.73956226861214e-05, + "loss": 0.7874, + "step": 36000 + }, + { + "epoch": 0.85, + "learning_rate": 4.735673397007125e-05, + "loss": 0.7826, + "step": 36500 + }, + { + "epoch": 0.86, + "learning_rate": 4.73178452540211e-05, + "loss": 0.7902, + "step": 37000 + }, + { + "epoch": 0.87, + "learning_rate": 4.727895653797094e-05, + "loss": 0.7695, + "step": 37500 + }, + { + "epoch": 0.88, + "learning_rate": 4.724006782192079e-05, + "loss": 0.7789, + "step": 38000 + }, + { + "epoch": 0.89, + "learning_rate": 4.720117910587064e-05, + "loss": 0.7739, + "step": 38500 + }, + { + "epoch": 0.91, + "learning_rate": 4.716229038982049e-05, + "loss": 0.7726, + "step": 39000 + }, + { + "epoch": 0.92, + "learning_rate": 4.712340167377034e-05, + "loss": 0.7722, + "step": 39500 + }, + { + "epoch": 0.93, + "learning_rate": 4.708451295772019e-05, + "loss": 0.7656, + "step": 40000 + }, + { + "epoch": 0.94, + "learning_rate": 4.704562424167004e-05, + "loss": 0.77, + "step": 40500 + }, + { + "epoch": 0.95, + "learning_rate": 4.700689108048409e-05, + "loss": 0.7814, + "step": 41000 + }, + { + "epoch": 0.96, + "learning_rate": 4.696800236443394e-05, + "loss": 0.7704, + "step": 41500 + }, + { + "epoch": 0.98, + "learning_rate": 4.692919142581589e-05, + "loss": 0.7611, + "step": 42000 + }, + { + "epoch": 0.99, + "learning_rate": 4.689038048719784e-05, + "loss": 0.7629, + "step": 42500 + }, + { + "epoch": 1.0, + "learning_rate": 4.6851491771147685e-05, + "loss": 0.7676, + "step": 43000 + }, + { + "epoch": 1.0, + "eval_bleu": 55.2526, + "eval_gen_len": 16.382, + "eval_loss": 0.7125306129455566, + "eval_runtime": 7352.67, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.461, + "step": 43024 + }, + { + "epoch": 1.01, + "learning_rate": 4.6812603055097536e-05, + "loss": 0.6434, + "step": 43500 + }, + { + "epoch": 1.02, + "learning_rate": 4.6773792116479486e-05, + "loss": 0.6436, + "step": 44000 + }, + { + "epoch": 1.03, + "learning_rate": 4.673490340042934e-05, + "loss": 0.651, + "step": 44500 + }, + { + "epoch": 1.05, + "learning_rate": 4.669601468437918e-05, + "loss": 0.6517, + "step": 45000 + }, + { + "epoch": 1.06, + "learning_rate": 4.665712596832903e-05, + "loss": 0.6416, + "step": 45500 + }, + { + "epoch": 1.07, + "learning_rate": 4.661823725227888e-05, + "loss": 0.6517, + "step": 46000 + }, + { + "epoch": 1.08, + "learning_rate": 4.657934853622873e-05, + "loss": 0.6539, + "step": 46500 + }, + { + "epoch": 1.09, + "learning_rate": 4.654045982017858e-05, + "loss": 0.6587, + "step": 47000 + }, + { + "epoch": 1.1, + "learning_rate": 4.6501571104128426e-05, + "loss": 0.6518, + "step": 47500 + }, + { + "epoch": 1.12, + "learning_rate": 4.646268238807828e-05, + "loss": 0.6465, + "step": 48000 + }, + { + "epoch": 1.13, + "learning_rate": 4.642379367202813e-05, + "loss": 0.6437, + "step": 48500 + }, + { + "epoch": 1.14, + "learning_rate": 4.6384904955977974e-05, + "loss": 0.6458, + "step": 49000 + }, + { + "epoch": 1.15, + "learning_rate": 4.6346016239927825e-05, + "loss": 0.6476, + "step": 49500 + }, + { + "epoch": 1.16, + "learning_rate": 4.630712752387768e-05, + "loss": 0.6587, + "step": 50000 + }, + { + "epoch": 1.17, + "learning_rate": 4.626823880782752e-05, + "loss": 0.6392, + "step": 50500 + }, + { + "epoch": 1.19, + "learning_rate": 4.622935009177737e-05, + "loss": 0.6547, + "step": 51000 + }, + { + "epoch": 1.2, + "learning_rate": 4.6190461375727225e-05, + "loss": 0.6511, + "step": 51500 + }, + { + "epoch": 1.21, + "learning_rate": 4.615157265967707e-05, + "loss": 0.6522, + "step": 52000 + }, + { + "epoch": 1.22, + "learning_rate": 4.611268394362692e-05, + "loss": 0.6555, + "step": 52500 + }, + { + "epoch": 1.23, + "learning_rate": 4.607395078244097e-05, + "loss": 0.6461, + "step": 53000 + }, + { + "epoch": 1.24, + "learning_rate": 4.6035062066390814e-05, + "loss": 0.6471, + "step": 53500 + }, + { + "epoch": 1.26, + "learning_rate": 4.5996173350340665e-05, + "loss": 0.6563, + "step": 54000 + }, + { + "epoch": 1.27, + "learning_rate": 4.5957284634290517e-05, + "loss": 0.6447, + "step": 54500 + }, + { + "epoch": 1.28, + "learning_rate": 4.591839591824036e-05, + "loss": 0.6483, + "step": 55000 + }, + { + "epoch": 1.29, + "learning_rate": 4.587958497962231e-05, + "loss": 0.6475, + "step": 55500 + }, + { + "epoch": 1.3, + "learning_rate": 4.584069626357216e-05, + "loss": 0.6546, + "step": 56000 + }, + { + "epoch": 1.31, + "learning_rate": 4.5801807547522014e-05, + "loss": 0.6528, + "step": 56500 + }, + { + "epoch": 1.32, + "learning_rate": 4.5762918831471866e-05, + "loss": 0.6538, + "step": 57000 + }, + { + "epoch": 1.34, + "learning_rate": 4.572403011542171e-05, + "loss": 0.6511, + "step": 57500 + }, + { + "epoch": 1.35, + "learning_rate": 4.568521917680366e-05, + "loss": 0.6457, + "step": 58000 + }, + { + "epoch": 1.36, + "learning_rate": 4.564633046075351e-05, + "loss": 0.659, + "step": 58500 + }, + { + "epoch": 1.37, + "learning_rate": 4.560744174470336e-05, + "loss": 0.6358, + "step": 59000 + }, + { + "epoch": 1.38, + "learning_rate": 4.556855302865321e-05, + "loss": 0.6425, + "step": 59500 + }, + { + "epoch": 1.39, + "learning_rate": 4.552966431260306e-05, + "loss": 0.644, + "step": 60000 + }, + { + "epoch": 1.41, + "learning_rate": 4.5490775596552904e-05, + "loss": 0.6537, + "step": 60500 + }, + { + "epoch": 1.42, + "learning_rate": 4.5451886880502756e-05, + "loss": 0.6529, + "step": 61000 + }, + { + "epoch": 1.43, + "learning_rate": 4.5413075941884705e-05, + "loss": 0.6547, + "step": 61500 + }, + { + "epoch": 1.44, + "learning_rate": 4.537418722583455e-05, + "loss": 0.6443, + "step": 62000 + }, + { + "epoch": 1.45, + "learning_rate": 4.53352985097844e-05, + "loss": 0.6516, + "step": 62500 + }, + { + "epoch": 1.46, + "learning_rate": 4.529640979373425e-05, + "loss": 0.6439, + "step": 63000 + }, + { + "epoch": 1.48, + "learning_rate": 4.52575210776841e-05, + "loss": 0.6401, + "step": 63500 + }, + { + "epoch": 1.49, + "learning_rate": 4.521863236163395e-05, + "loss": 0.6545, + "step": 64000 + }, + { + "epoch": 1.5, + "learning_rate": 4.51797436455838e-05, + "loss": 0.6433, + "step": 64500 + }, + { + "epoch": 1.51, + "learning_rate": 4.514085492953365e-05, + "loss": 0.6538, + "step": 65000 + }, + { + "epoch": 1.52, + "learning_rate": 4.51019662134835e-05, + "loss": 0.6548, + "step": 65500 + }, + { + "epoch": 1.53, + "learning_rate": 4.506307749743335e-05, + "loss": 0.6458, + "step": 66000 + }, + { + "epoch": 1.55, + "learning_rate": 4.50241887813832e-05, + "loss": 0.649, + "step": 66500 + }, + { + "epoch": 1.56, + "learning_rate": 4.4985300065333045e-05, + "loss": 0.6471, + "step": 67000 + }, + { + "epoch": 1.57, + "learning_rate": 4.4946411349282896e-05, + "loss": 0.6526, + "step": 67500 + }, + { + "epoch": 1.58, + "learning_rate": 4.4907600410664846e-05, + "loss": 0.646, + "step": 68000 + }, + { + "epoch": 1.59, + "learning_rate": 4.486878947204679e-05, + "loss": 0.6491, + "step": 68500 + }, + { + "epoch": 1.6, + "learning_rate": 4.482990075599664e-05, + "loss": 0.6484, + "step": 69000 + }, + { + "epoch": 1.62, + "learning_rate": 4.479101203994649e-05, + "loss": 0.6421, + "step": 69500 + }, + { + "epoch": 1.63, + "learning_rate": 4.475220110132844e-05, + "loss": 0.6498, + "step": 70000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4713312385278286e-05, + "loss": 0.6497, + "step": 70500 + }, + { + "epoch": 1.65, + "learning_rate": 4.467442366922814e-05, + "loss": 0.6503, + "step": 71000 + }, + { + "epoch": 1.66, + "learning_rate": 4.463553495317799e-05, + "loss": 0.6314, + "step": 71500 + }, + { + "epoch": 1.67, + "learning_rate": 4.459672401455994e-05, + "loss": 0.6495, + "step": 72000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4557835298509784e-05, + "loss": 0.643, + "step": 72500 + }, + { + "epoch": 1.7, + "learning_rate": 4.4518946582459635e-05, + "loss": 0.6542, + "step": 73000 + }, + { + "epoch": 1.71, + "learning_rate": 4.448005786640949e-05, + "loss": 0.6522, + "step": 73500 + }, + { + "epoch": 1.72, + "learning_rate": 4.444116915035934e-05, + "loss": 0.6476, + "step": 74000 + }, + { + "epoch": 1.73, + "learning_rate": 4.440228043430918e-05, + "loss": 0.6337, + "step": 74500 + }, + { + "epoch": 1.74, + "learning_rate": 4.4363391718259035e-05, + "loss": 0.6546, + "step": 75000 + }, + { + "epoch": 1.75, + "learning_rate": 4.4324503002208886e-05, + "loss": 0.647, + "step": 75500 + }, + { + "epoch": 1.77, + "learning_rate": 4.4285614286158724e-05, + "loss": 0.6467, + "step": 76000 + }, + { + "epoch": 1.78, + "learning_rate": 4.4246725570108576e-05, + "loss": 0.6432, + "step": 76500 + }, + { + "epoch": 1.79, + "learning_rate": 4.420783685405843e-05, + "loss": 0.6399, + "step": 77000 + }, + { + "epoch": 1.8, + "learning_rate": 4.416894813800828e-05, + "loss": 0.6382, + "step": 77500 + }, + { + "epoch": 1.81, + "learning_rate": 4.4130059421958123e-05, + "loss": 0.649, + "step": 78000 + }, + { + "epoch": 1.82, + "learning_rate": 4.409124848334007e-05, + "loss": 0.6385, + "step": 78500 + }, + { + "epoch": 1.84, + "learning_rate": 4.4052359767289925e-05, + "loss": 0.6448, + "step": 79000 + }, + { + "epoch": 1.85, + "learning_rate": 4.4013471051239776e-05, + "loss": 0.638, + "step": 79500 + }, + { + "epoch": 1.86, + "learning_rate": 4.397458233518962e-05, + "loss": 0.6317, + "step": 80000 + }, + { + "epoch": 1.87, + "learning_rate": 4.393569361913947e-05, + "loss": 0.6338, + "step": 80500 + }, + { + "epoch": 1.88, + "learning_rate": 4.3896804903089324e-05, + "loss": 0.6406, + "step": 81000 + }, + { + "epoch": 1.89, + "learning_rate": 4.385791618703917e-05, + "loss": 0.6363, + "step": 81500 + }, + { + "epoch": 1.91, + "learning_rate": 4.381902747098902e-05, + "loss": 0.6381, + "step": 82000 + }, + { + "epoch": 1.92, + "learning_rate": 4.378021653237097e-05, + "loss": 0.6383, + "step": 82500 + }, + { + "epoch": 1.93, + "learning_rate": 4.374140559375292e-05, + "loss": 0.6351, + "step": 83000 + }, + { + "epoch": 1.94, + "learning_rate": 4.370251687770277e-05, + "loss": 0.642, + "step": 83500 + }, + { + "epoch": 1.95, + "learning_rate": 4.3663705939084714e-05, + "loss": 0.6399, + "step": 84000 + }, + { + "epoch": 1.96, + "learning_rate": 4.3624817223034566e-05, + "loss": 0.6351, + "step": 84500 + }, + { + "epoch": 1.98, + "learning_rate": 4.358592850698442e-05, + "loss": 0.631, + "step": 85000 + }, + { + "epoch": 1.99, + "learning_rate": 4.354703979093426e-05, + "loss": 0.6298, + "step": 85500 + }, + { + "epoch": 2.0, + "learning_rate": 4.350822885231621e-05, + "loss": 0.6349, + "step": 86000 + }, + { + "epoch": 2.0, + "eval_bleu": 58.202, + "eval_gen_len": 15.9466, + "eval_loss": 0.6546894311904907, + "eval_runtime": 6844.8665, + "eval_samples_per_second": 12.552, + "eval_steps_per_second": 1.569, + "step": 86048 + }, + { + "epoch": 2.01, + "learning_rate": 4.346934013626606e-05, + "loss": 0.5093, + "step": 86500 + }, + { + "epoch": 2.02, + "learning_rate": 4.3430451420215914e-05, + "loss": 0.5066, + "step": 87000 + }, + { + "epoch": 2.03, + "learning_rate": 4.339156270416576e-05, + "loss": 0.4956, + "step": 87500 + }, + { + "epoch": 2.05, + "learning_rate": 4.335267398811561e-05, + "loss": 0.5, + "step": 88000 + }, + { + "epoch": 2.06, + "learning_rate": 4.331378527206546e-05, + "loss": 0.5034, + "step": 88500 + }, + { + "epoch": 2.07, + "learning_rate": 4.327489655601531e-05, + "loss": 0.505, + "step": 89000 + }, + { + "epoch": 2.08, + "learning_rate": 4.323600783996516e-05, + "loss": 0.4995, + "step": 89500 + }, + { + "epoch": 2.09, + "learning_rate": 4.319719690134711e-05, + "loss": 0.5064, + "step": 90000 + }, + { + "epoch": 2.1, + "learning_rate": 4.315830818529696e-05, + "loss": 0.5065, + "step": 90500 + }, + { + "epoch": 2.12, + "learning_rate": 4.311949724667891e-05, + "loss": 0.5031, + "step": 91000 + }, + { + "epoch": 2.13, + "learning_rate": 4.3080608530628754e-05, + "loss": 0.4967, + "step": 91500 + }, + { + "epoch": 2.14, + "learning_rate": 4.30417198145786e-05, + "loss": 0.5112, + "step": 92000 + }, + { + "epoch": 2.15, + "learning_rate": 4.300290887596055e-05, + "loss": 0.5069, + "step": 92500 + }, + { + "epoch": 2.16, + "learning_rate": 4.29640201599104e-05, + "loss": 0.5106, + "step": 93000 + }, + { + "epoch": 2.17, + "learning_rate": 4.292513144386025e-05, + "loss": 0.5077, + "step": 93500 + }, + { + "epoch": 2.18, + "learning_rate": 4.28862427278101e-05, + "loss": 0.5179, + "step": 94000 + }, + { + "epoch": 2.2, + "learning_rate": 4.284735401175995e-05, + "loss": 0.5192, + "step": 94500 + }, + { + "epoch": 2.21, + "learning_rate": 4.28084652957098e-05, + "loss": 0.5103, + "step": 95000 + }, + { + "epoch": 2.22, + "learning_rate": 4.276957657965965e-05, + "loss": 0.5203, + "step": 95500 + }, + { + "epoch": 2.23, + "learning_rate": 4.2730687863609496e-05, + "loss": 0.5179, + "step": 96000 + }, + { + "epoch": 2.24, + "learning_rate": 4.269179914755935e-05, + "loss": 0.515, + "step": 96500 + }, + { + "epoch": 2.25, + "learning_rate": 4.26529104315092e-05, + "loss": 0.5133, + "step": 97000 + }, + { + "epoch": 2.27, + "learning_rate": 4.2614021715459043e-05, + "loss": 0.5147, + "step": 97500 + }, + { + "epoch": 2.28, + "learning_rate": 4.2575132999408895e-05, + "loss": 0.5308, + "step": 98000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2536322060790845e-05, + "loss": 0.5207, + "step": 98500 + }, + { + "epoch": 2.3, + "learning_rate": 4.2497511122172794e-05, + "loss": 0.5228, + "step": 99000 + }, + { + "epoch": 2.31, + "learning_rate": 4.245862240612264e-05, + "loss": 0.5207, + "step": 99500 + }, + { + "epoch": 2.32, + "learning_rate": 4.241973369007249e-05, + "loss": 0.5211, + "step": 100000 + }, + { + "epoch": 2.34, + "learning_rate": 4.2380844974022335e-05, + "loss": 0.5169, + "step": 100500 + }, + { + "epoch": 2.35, + "learning_rate": 4.234195625797219e-05, + "loss": 0.5302, + "step": 101000 + }, + { + "epoch": 2.36, + "learning_rate": 4.230306754192204e-05, + "loss": 0.5155, + "step": 101500 + }, + { + "epoch": 2.37, + "learning_rate": 4.226417882587188e-05, + "loss": 0.5193, + "step": 102000 + }, + { + "epoch": 2.38, + "learning_rate": 4.2225290109821735e-05, + "loss": 0.518, + "step": 102500 + }, + { + "epoch": 2.39, + "learning_rate": 4.2186401393771586e-05, + "loss": 0.5171, + "step": 103000 + }, + { + "epoch": 2.41, + "learning_rate": 4.214751267772144e-05, + "loss": 0.5208, + "step": 103500 + }, + { + "epoch": 2.42, + "learning_rate": 4.210862396167128e-05, + "loss": 0.5231, + "step": 104000 + }, + { + "epoch": 2.43, + "learning_rate": 4.2069735245621134e-05, + "loss": 0.5267, + "step": 104500 + }, + { + "epoch": 2.44, + "learning_rate": 4.2030846529570985e-05, + "loss": 0.5196, + "step": 105000 + }, + { + "epoch": 2.45, + "learning_rate": 4.199195781352083e-05, + "loss": 0.5234, + "step": 105500 + }, + { + "epoch": 2.46, + "learning_rate": 4.1953224652334885e-05, + "loss": 0.526, + "step": 106000 + }, + { + "epoch": 2.48, + "learning_rate": 4.191449149114893e-05, + "loss": 0.5312, + "step": 106500 + }, + { + "epoch": 2.49, + "learning_rate": 4.187560277509878e-05, + "loss": 0.5286, + "step": 107000 + }, + { + "epoch": 2.5, + "learning_rate": 4.183671405904863e-05, + "loss": 0.5225, + "step": 107500 + }, + { + "epoch": 2.51, + "learning_rate": 4.1797825342998474e-05, + "loss": 0.5207, + "step": 108000 + }, + { + "epoch": 2.52, + "learning_rate": 4.1758936626948325e-05, + "loss": 0.5263, + "step": 108500 + }, + { + "epoch": 2.53, + "learning_rate": 4.172004791089818e-05, + "loss": 0.5243, + "step": 109000 + }, + { + "epoch": 2.55, + "learning_rate": 4.1681236972280126e-05, + "loss": 0.5332, + "step": 109500 + }, + { + "epoch": 2.56, + "learning_rate": 4.164234825622997e-05, + "loss": 0.5297, + "step": 110000 + }, + { + "epoch": 2.57, + "learning_rate": 4.160345954017982e-05, + "loss": 0.5274, + "step": 110500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1564570824129674e-05, + "loss": 0.5241, + "step": 111000 + }, + { + "epoch": 2.59, + "learning_rate": 4.152568210807952e-05, + "loss": 0.528, + "step": 111500 + }, + { + "epoch": 2.6, + "learning_rate": 4.148679339202937e-05, + "loss": 0.53, + "step": 112000 + }, + { + "epoch": 2.61, + "learning_rate": 4.144790467597922e-05, + "loss": 0.5291, + "step": 112500 + }, + { + "epoch": 2.63, + "learning_rate": 4.1409015959929073e-05, + "loss": 0.5431, + "step": 113000 + }, + { + "epoch": 2.64, + "learning_rate": 4.137012724387892e-05, + "loss": 0.5336, + "step": 113500 + }, + { + "epoch": 2.65, + "learning_rate": 4.133123852782877e-05, + "loss": 0.531, + "step": 114000 + }, + { + "epoch": 2.66, + "learning_rate": 4.129234981177862e-05, + "loss": 0.5278, + "step": 114500 + }, + { + "epoch": 2.67, + "learning_rate": 4.1253461095728466e-05, + "loss": 0.5214, + "step": 115000 + }, + { + "epoch": 2.68, + "learning_rate": 4.121457237967831e-05, + "loss": 0.5249, + "step": 115500 + }, + { + "epoch": 2.7, + "learning_rate": 4.117568366362816e-05, + "loss": 0.5351, + "step": 116000 + }, + { + "epoch": 2.71, + "learning_rate": 4.1136794947578014e-05, + "loss": 0.5275, + "step": 116500 + }, + { + "epoch": 2.72, + "learning_rate": 4.109790623152786e-05, + "loss": 0.5309, + "step": 117000 + }, + { + "epoch": 2.73, + "learning_rate": 4.105901751547771e-05, + "loss": 0.5291, + "step": 117500 + }, + { + "epoch": 2.74, + "learning_rate": 4.102012879942756e-05, + "loss": 0.5274, + "step": 118000 + }, + { + "epoch": 2.75, + "learning_rate": 4.098131786080951e-05, + "loss": 0.532, + "step": 118500 + }, + { + "epoch": 2.77, + "learning_rate": 4.0942429144759356e-05, + "loss": 0.5278, + "step": 119000 + }, + { + "epoch": 2.78, + "learning_rate": 4.090354042870921e-05, + "loss": 0.5338, + "step": 119500 + }, + { + "epoch": 2.79, + "learning_rate": 4.086465171265906e-05, + "loss": 0.5289, + "step": 120000 + }, + { + "epoch": 2.8, + "learning_rate": 4.082576299660891e-05, + "loss": 0.5388, + "step": 120500 + }, + { + "epoch": 2.81, + "learning_rate": 4.0786874280558755e-05, + "loss": 0.5337, + "step": 121000 + }, + { + "epoch": 2.82, + "learning_rate": 4.074798556450861e-05, + "loss": 0.5308, + "step": 121500 + }, + { + "epoch": 2.84, + "learning_rate": 4.0709174625890556e-05, + "loss": 0.5294, + "step": 122000 + }, + { + "epoch": 2.85, + "learning_rate": 4.067028590984041e-05, + "loss": 0.5385, + "step": 122500 + }, + { + "epoch": 2.86, + "learning_rate": 4.063139719379025e-05, + "loss": 0.5379, + "step": 123000 + }, + { + "epoch": 2.87, + "learning_rate": 4.05925084777401e-05, + "loss": 0.5485, + "step": 123500 + }, + { + "epoch": 2.88, + "learning_rate": 4.055361976168995e-05, + "loss": 0.5391, + "step": 124000 + }, + { + "epoch": 2.89, + "learning_rate": 4.05147310456398e-05, + "loss": 0.5414, + "step": 124500 + }, + { + "epoch": 2.91, + "learning_rate": 4.0475842329589645e-05, + "loss": 0.5237, + "step": 125000 + }, + { + "epoch": 2.92, + "learning_rate": 4.04369536135395e-05, + "loss": 0.5363, + "step": 125500 + }, + { + "epoch": 2.93, + "learning_rate": 4.039806489748935e-05, + "loss": 0.543, + "step": 126000 + }, + { + "epoch": 2.94, + "learning_rate": 4.0359331736303396e-05, + "loss": 0.5265, + "step": 126500 + }, + { + "epoch": 2.95, + "learning_rate": 4.032044302025325e-05, + "loss": 0.5379, + "step": 127000 + }, + { + "epoch": 2.96, + "learning_rate": 4.028155430420309e-05, + "loss": 0.5311, + "step": 127500 + }, + { + "epoch": 2.98, + "learning_rate": 4.0242665588152944e-05, + "loss": 0.5361, + "step": 128000 + }, + { + "epoch": 2.99, + "learning_rate": 4.0203776872102795e-05, + "loss": 0.5405, + "step": 128500 + }, + { + "epoch": 3.0, + "learning_rate": 4.016488815605265e-05, + "loss": 0.537, + "step": 129000 + }, + { + "epoch": 3.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6645.2011, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616, + "step": 129072 + } + ], + "logging_steps": 500, + "max_steps": 645360, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 4.4754523745125663e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-129072/training_args.bin b/checkpoint-129072/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c71a9dade62e1afff1c0282f02a162933d87afd7 --- /dev/null +++ b/checkpoint-129072/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d80f6465473a02dd91018cea5b0675845684097cfa8269a7e746c79607dbe1 +size 4856 diff --git a/checkpoint-215120/config.json b/checkpoint-215120/config.json new file mode 100644 index 0000000000000000000000000000000000000000..77b8612fc6bf9e6aaa7560c8bcce3e4c9c6986de --- /dev/null +++ b/checkpoint-215120/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.37.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-215120/generation_config.json b/checkpoint-215120/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d0b25bc2cc841a32d579c94d80eb6323756b7a --- /dev/null +++ b/checkpoint-215120/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.37.2" +} diff --git a/checkpoint-215120/model.safetensors b/checkpoint-215120/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a397a8c525772b8d10df5348ac0c0b34e57f4182 --- /dev/null +++ b/checkpoint-215120/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a7760d1a10fd0889e8dba0964ed5ffc622c89647b7918b9fca6d301b692522f +size 2444578688 diff --git a/checkpoint-215120/optimizer.pt b/checkpoint-215120/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..762bbf4cdecf224ee66f29f14c1fd2cdf0287134 --- /dev/null +++ b/checkpoint-215120/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adcd2363a5d432573828e1d9d2b92cdbdfbd92646e3067bd4465a37613da3887 +size 4887473903 diff --git a/checkpoint-215120/rng_state_0.pth b/checkpoint-215120/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3bce19ab8061ade350aea3ea4d57ef8a0ebca49 --- /dev/null +++ b/checkpoint-215120/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7515d96721222705c1d9c34d455f560c3b77f82a13b920c0bef14643de49d9ce +size 15024 diff --git a/checkpoint-215120/rng_state_1.pth b/checkpoint-215120/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b16db441559758c319c8b6667cedf7782ead0c9 --- /dev/null +++ b/checkpoint-215120/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca4568a8d60873c5bf1b9c72b300b6100f45b290cb66334f1ddb796af2cf8deb +size 15024 diff --git a/checkpoint-215120/rng_state_2.pth b/checkpoint-215120/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..686f8e0219af7f6a4d5c21bcf0b46de301f0eef2 --- /dev/null +++ b/checkpoint-215120/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7475c977d26729e48196777eb45ec738365fa88792965fed2836077fb5cc4c45 +size 15024 diff --git a/checkpoint-215120/rng_state_3.pth b/checkpoint-215120/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..bf5779e2ab4044fe3b17b06e1e2c3bee71739806 --- /dev/null +++ b/checkpoint-215120/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b4d5fde944e861202e0796f9a382733f3150a8996778aeb3e4260a63f0eb87 +size 15024 diff --git a/checkpoint-215120/scheduler.pt b/checkpoint-215120/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..577389a48238e0a8dfa4c7be1da8fc48a53c9869 --- /dev/null +++ b/checkpoint-215120/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f448f718e21c508c439c4ff08aaa5a81a6ba38f829b244c7cc53747e509d5574 +size 1064 diff --git a/checkpoint-215120/sentencepiece.bpe.model b/checkpoint-215120/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-215120/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-215120/special_tokens_map.json b/checkpoint-215120/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-215120/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-215120/tokenizer.json b/checkpoint-215120/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc6a4f3075bc2a01607c72e81fd24456ab68311 --- /dev/null +++ b/checkpoint-215120/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb9b1f3e7ce9f6c1a5ab4560578eda3329db396be400909c5d34c8d0b08b0ed +size 17110208 diff --git a/checkpoint-215120/tokenizer_config.json b/checkpoint-215120/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70c0515a5815fcc727e11e053116348bfac12128 --- /dev/null +++ b/checkpoint-215120/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "ja_XX", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-215120/trainer_state.json b/checkpoint-215120/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..20781340b9f435448741cc4273269bbd10aa12fc --- /dev/null +++ b/checkpoint-215120/trainer_state.json @@ -0,0 +1,2651 @@ +{ + "best_metric": 0.6417234539985657, + "best_model_checkpoint": "./enko_mbartLarge_100p_sup2/checkpoint-129072", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 215120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.940000000000001e-06, + "loss": 2.2901, + "step": 500 + }, + { + "epoch": 0.02, + "learning_rate": 1.992e-05, + "loss": 1.6084, + "step": 1000 + }, + { + "epoch": 0.03, + "learning_rate": 2.9920000000000005e-05, + "loss": 1.449, + "step": 1500 + }, + { + "epoch": 0.05, + "learning_rate": 3.9920000000000004e-05, + "loss": 1.3874, + "step": 2000 + }, + { + "epoch": 0.06, + "learning_rate": 4.992e-05, + "loss": 1.3485, + "step": 2500 + }, + { + "epoch": 0.07, + "learning_rate": 4.996142239367825e-05, + "loss": 1.2926, + "step": 3000 + }, + { + "epoch": 0.08, + "learning_rate": 4.9922533677628105e-05, + "loss": 1.248, + "step": 3500 + }, + { + "epoch": 0.09, + "learning_rate": 4.988364496157795e-05, + "loss": 1.1931, + "step": 4000 + }, + { + "epoch": 0.1, + "learning_rate": 4.98447562455278e-05, + "loss": 1.1677, + "step": 4500 + }, + { + "epoch": 0.12, + "learning_rate": 4.980594530690975e-05, + "loss": 1.142, + "step": 5000 + }, + { + "epoch": 0.13, + "learning_rate": 4.97670565908596e-05, + "loss": 1.122, + "step": 5500 + }, + { + "epoch": 0.14, + "learning_rate": 4.972816787480945e-05, + "loss": 1.0855, + "step": 6000 + }, + { + "epoch": 0.15, + "learning_rate": 4.968927915875929e-05, + "loss": 1.0719, + "step": 6500 + }, + { + "epoch": 0.16, + "learning_rate": 4.965039044270914e-05, + "loss": 1.053, + "step": 7000 + }, + { + "epoch": 0.17, + "learning_rate": 4.9611501726658995e-05, + "loss": 1.0359, + "step": 7500 + }, + { + "epoch": 0.19, + "learning_rate": 4.957261301060884e-05, + "loss": 1.0232, + "step": 8000 + }, + { + "epoch": 0.2, + "learning_rate": 4.953380207199079e-05, + "loss": 1.0169, + "step": 8500 + }, + { + "epoch": 0.21, + "learning_rate": 4.949491335594064e-05, + "loss": 1.0141, + "step": 9000 + }, + { + "epoch": 0.22, + "learning_rate": 4.945602463989049e-05, + "loss": 0.9855, + "step": 9500 + }, + { + "epoch": 0.23, + "learning_rate": 4.941721370127244e-05, + "loss": 0.9932, + "step": 10000 + }, + { + "epoch": 0.24, + "learning_rate": 4.9378324985222293e-05, + "loss": 0.9656, + "step": 10500 + }, + { + "epoch": 0.26, + "learning_rate": 4.933943626917214e-05, + "loss": 0.9694, + "step": 11000 + }, + { + "epoch": 0.27, + "learning_rate": 4.930054755312199e-05, + "loss": 0.9609, + "step": 11500 + }, + { + "epoch": 0.28, + "learning_rate": 4.926165883707184e-05, + "loss": 0.9526, + "step": 12000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9222770121021686e-05, + "loss": 0.9359, + "step": 12500 + }, + { + "epoch": 0.3, + "learning_rate": 4.9183959182403636e-05, + "loss": 0.9324, + "step": 13000 + }, + { + "epoch": 0.31, + "learning_rate": 4.914507046635349e-05, + "loss": 0.9251, + "step": 13500 + }, + { + "epoch": 0.33, + "learning_rate": 4.910625952773543e-05, + "loss": 0.9216, + "step": 14000 + }, + { + "epoch": 0.34, + "learning_rate": 4.906737081168528e-05, + "loss": 0.9181, + "step": 14500 + }, + { + "epoch": 0.35, + "learning_rate": 4.902855987306723e-05, + "loss": 0.9098, + "step": 15000 + }, + { + "epoch": 0.36, + "learning_rate": 4.898967115701708e-05, + "loss": 0.9067, + "step": 15500 + }, + { + "epoch": 0.37, + "learning_rate": 4.895078244096693e-05, + "loss": 0.8931, + "step": 16000 + }, + { + "epoch": 0.38, + "learning_rate": 4.891189372491678e-05, + "loss": 0.8923, + "step": 16500 + }, + { + "epoch": 0.4, + "learning_rate": 4.887300500886663e-05, + "loss": 0.8703, + "step": 17000 + }, + { + "epoch": 0.41, + "learning_rate": 4.8834116292816475e-05, + "loss": 0.8861, + "step": 17500 + }, + { + "epoch": 0.42, + "learning_rate": 4.879522757676633e-05, + "loss": 0.8864, + "step": 18000 + }, + { + "epoch": 0.43, + "learning_rate": 4.875633886071618e-05, + "loss": 0.886, + "step": 18500 + }, + { + "epoch": 0.44, + "learning_rate": 4.871752792209813e-05, + "loss": 0.8737, + "step": 19000 + }, + { + "epoch": 0.45, + "learning_rate": 4.867863920604798e-05, + "loss": 0.8708, + "step": 19500 + }, + { + "epoch": 0.46, + "learning_rate": 4.8639750489997824e-05, + "loss": 0.865, + "step": 20000 + }, + { + "epoch": 0.48, + "learning_rate": 4.8600861773947676e-05, + "loss": 0.8571, + "step": 20500 + }, + { + "epoch": 0.49, + "learning_rate": 4.856197305789753e-05, + "loss": 0.8621, + "step": 21000 + }, + { + "epoch": 0.5, + "learning_rate": 4.852308434184737e-05, + "loss": 0.8607, + "step": 21500 + }, + { + "epoch": 0.51, + "learning_rate": 4.848419562579722e-05, + "loss": 0.8519, + "step": 22000 + }, + { + "epoch": 0.52, + "learning_rate": 4.844530690974707e-05, + "loss": 0.8472, + "step": 22500 + }, + { + "epoch": 0.53, + "learning_rate": 4.840641819369692e-05, + "loss": 0.8381, + "step": 23000 + }, + { + "epoch": 0.55, + "learning_rate": 4.8367529477646765e-05, + "loss": 0.8329, + "step": 23500 + }, + { + "epoch": 0.56, + "learning_rate": 4.8328640761596616e-05, + "loss": 0.8425, + "step": 24000 + }, + { + "epoch": 0.57, + "learning_rate": 4.828975204554647e-05, + "loss": 0.8227, + "step": 24500 + }, + { + "epoch": 0.58, + "learning_rate": 4.825086332949631e-05, + "loss": 0.8349, + "step": 25000 + }, + { + "epoch": 0.59, + "learning_rate": 4.8211974613446164e-05, + "loss": 0.8235, + "step": 25500 + }, + { + "epoch": 0.6, + "learning_rate": 4.8173085897396015e-05, + "loss": 0.8116, + "step": 26000 + }, + { + "epoch": 0.62, + "learning_rate": 4.813419718134587e-05, + "loss": 0.8367, + "step": 26500 + }, + { + "epoch": 0.63, + "learning_rate": 4.809530846529571e-05, + "loss": 0.8219, + "step": 27000 + }, + { + "epoch": 0.64, + "learning_rate": 4.805641974924556e-05, + "loss": 0.8206, + "step": 27500 + }, + { + "epoch": 0.65, + "learning_rate": 4.8017531033195415e-05, + "loss": 0.8199, + "step": 28000 + }, + { + "epoch": 0.66, + "learning_rate": 4.797864231714526e-05, + "loss": 0.816, + "step": 28500 + }, + { + "epoch": 0.67, + "learning_rate": 4.793983137852721e-05, + "loss": 0.8144, + "step": 29000 + }, + { + "epoch": 0.69, + "learning_rate": 4.790102043990916e-05, + "loss": 0.806, + "step": 29500 + }, + { + "epoch": 0.7, + "learning_rate": 4.786220950129111e-05, + "loss": 0.8134, + "step": 30000 + }, + { + "epoch": 0.71, + "learning_rate": 4.782332078524095e-05, + "loss": 0.8109, + "step": 30500 + }, + { + "epoch": 0.72, + "learning_rate": 4.7784432069190805e-05, + "loss": 0.792, + "step": 31000 + }, + { + "epoch": 0.73, + "learning_rate": 4.7745543353140656e-05, + "loss": 0.8046, + "step": 31500 + }, + { + "epoch": 0.74, + "learning_rate": 4.77066546370905e-05, + "loss": 0.7983, + "step": 32000 + }, + { + "epoch": 0.76, + "learning_rate": 4.766776592104035e-05, + "loss": 0.7937, + "step": 32500 + }, + { + "epoch": 0.77, + "learning_rate": 4.7628877204990204e-05, + "loss": 0.7978, + "step": 33000 + }, + { + "epoch": 0.78, + "learning_rate": 4.758998848894005e-05, + "loss": 0.7961, + "step": 33500 + }, + { + "epoch": 0.79, + "learning_rate": 4.75510997728899e-05, + "loss": 0.7854, + "step": 34000 + }, + { + "epoch": 0.8, + "learning_rate": 4.751221105683975e-05, + "loss": 0.8006, + "step": 34500 + }, + { + "epoch": 0.81, + "learning_rate": 4.7473322340789597e-05, + "loss": 0.7857, + "step": 35000 + }, + { + "epoch": 0.83, + "learning_rate": 4.743443362473945e-05, + "loss": 0.7857, + "step": 35500 + }, + { + "epoch": 0.84, + "learning_rate": 4.73956226861214e-05, + "loss": 0.7874, + "step": 36000 + }, + { + "epoch": 0.85, + "learning_rate": 4.735673397007125e-05, + "loss": 0.7826, + "step": 36500 + }, + { + "epoch": 0.86, + "learning_rate": 4.73178452540211e-05, + "loss": 0.7902, + "step": 37000 + }, + { + "epoch": 0.87, + "learning_rate": 4.727895653797094e-05, + "loss": 0.7695, + "step": 37500 + }, + { + "epoch": 0.88, + "learning_rate": 4.724006782192079e-05, + "loss": 0.7789, + "step": 38000 + }, + { + "epoch": 0.89, + "learning_rate": 4.720117910587064e-05, + "loss": 0.7739, + "step": 38500 + }, + { + "epoch": 0.91, + "learning_rate": 4.716229038982049e-05, + "loss": 0.7726, + "step": 39000 + }, + { + "epoch": 0.92, + "learning_rate": 4.712340167377034e-05, + "loss": 0.7722, + "step": 39500 + }, + { + "epoch": 0.93, + "learning_rate": 4.708451295772019e-05, + "loss": 0.7656, + "step": 40000 + }, + { + "epoch": 0.94, + "learning_rate": 4.704562424167004e-05, + "loss": 0.77, + "step": 40500 + }, + { + "epoch": 0.95, + "learning_rate": 4.700689108048409e-05, + "loss": 0.7814, + "step": 41000 + }, + { + "epoch": 0.96, + "learning_rate": 4.696800236443394e-05, + "loss": 0.7704, + "step": 41500 + }, + { + "epoch": 0.98, + "learning_rate": 4.692919142581589e-05, + "loss": 0.7611, + "step": 42000 + }, + { + "epoch": 0.99, + "learning_rate": 4.689038048719784e-05, + "loss": 0.7629, + "step": 42500 + }, + { + "epoch": 1.0, + "learning_rate": 4.6851491771147685e-05, + "loss": 0.7676, + "step": 43000 + }, + { + "epoch": 1.0, + "eval_bleu": 55.2526, + "eval_gen_len": 16.382, + "eval_loss": 0.7125306129455566, + "eval_runtime": 7352.67, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.461, + "step": 43024 + }, + { + "epoch": 1.01, + "learning_rate": 4.6812603055097536e-05, + "loss": 0.6434, + "step": 43500 + }, + { + "epoch": 1.02, + "learning_rate": 4.6773792116479486e-05, + "loss": 0.6436, + "step": 44000 + }, + { + "epoch": 1.03, + "learning_rate": 4.673490340042934e-05, + "loss": 0.651, + "step": 44500 + }, + { + "epoch": 1.05, + "learning_rate": 4.669601468437918e-05, + "loss": 0.6517, + "step": 45000 + }, + { + "epoch": 1.06, + "learning_rate": 4.665712596832903e-05, + "loss": 0.6416, + "step": 45500 + }, + { + "epoch": 1.07, + "learning_rate": 4.661823725227888e-05, + "loss": 0.6517, + "step": 46000 + }, + { + "epoch": 1.08, + "learning_rate": 4.657934853622873e-05, + "loss": 0.6539, + "step": 46500 + }, + { + "epoch": 1.09, + "learning_rate": 4.654045982017858e-05, + "loss": 0.6587, + "step": 47000 + }, + { + "epoch": 1.1, + "learning_rate": 4.6501571104128426e-05, + "loss": 0.6518, + "step": 47500 + }, + { + "epoch": 1.12, + "learning_rate": 4.646268238807828e-05, + "loss": 0.6465, + "step": 48000 + }, + { + "epoch": 1.13, + "learning_rate": 4.642379367202813e-05, + "loss": 0.6437, + "step": 48500 + }, + { + "epoch": 1.14, + "learning_rate": 4.6384904955977974e-05, + "loss": 0.6458, + "step": 49000 + }, + { + "epoch": 1.15, + "learning_rate": 4.6346016239927825e-05, + "loss": 0.6476, + "step": 49500 + }, + { + "epoch": 1.16, + "learning_rate": 4.630712752387768e-05, + "loss": 0.6587, + "step": 50000 + }, + { + "epoch": 1.17, + "learning_rate": 4.626823880782752e-05, + "loss": 0.6392, + "step": 50500 + }, + { + "epoch": 1.19, + "learning_rate": 4.622935009177737e-05, + "loss": 0.6547, + "step": 51000 + }, + { + "epoch": 1.2, + "learning_rate": 4.6190461375727225e-05, + "loss": 0.6511, + "step": 51500 + }, + { + "epoch": 1.21, + "learning_rate": 4.615157265967707e-05, + "loss": 0.6522, + "step": 52000 + }, + { + "epoch": 1.22, + "learning_rate": 4.611268394362692e-05, + "loss": 0.6555, + "step": 52500 + }, + { + "epoch": 1.23, + "learning_rate": 4.607395078244097e-05, + "loss": 0.6461, + "step": 53000 + }, + { + "epoch": 1.24, + "learning_rate": 4.6035062066390814e-05, + "loss": 0.6471, + "step": 53500 + }, + { + "epoch": 1.26, + "learning_rate": 4.5996173350340665e-05, + "loss": 0.6563, + "step": 54000 + }, + { + "epoch": 1.27, + "learning_rate": 4.5957284634290517e-05, + "loss": 0.6447, + "step": 54500 + }, + { + "epoch": 1.28, + "learning_rate": 4.591839591824036e-05, + "loss": 0.6483, + "step": 55000 + }, + { + "epoch": 1.29, + "learning_rate": 4.587958497962231e-05, + "loss": 0.6475, + "step": 55500 + }, + { + "epoch": 1.3, + "learning_rate": 4.584069626357216e-05, + "loss": 0.6546, + "step": 56000 + }, + { + "epoch": 1.31, + "learning_rate": 4.5801807547522014e-05, + "loss": 0.6528, + "step": 56500 + }, + { + "epoch": 1.32, + "learning_rate": 4.5762918831471866e-05, + "loss": 0.6538, + "step": 57000 + }, + { + "epoch": 1.34, + "learning_rate": 4.572403011542171e-05, + "loss": 0.6511, + "step": 57500 + }, + { + "epoch": 1.35, + "learning_rate": 4.568521917680366e-05, + "loss": 0.6457, + "step": 58000 + }, + { + "epoch": 1.36, + "learning_rate": 4.564633046075351e-05, + "loss": 0.659, + "step": 58500 + }, + { + "epoch": 1.37, + "learning_rate": 4.560744174470336e-05, + "loss": 0.6358, + "step": 59000 + }, + { + "epoch": 1.38, + "learning_rate": 4.556855302865321e-05, + "loss": 0.6425, + "step": 59500 + }, + { + "epoch": 1.39, + "learning_rate": 4.552966431260306e-05, + "loss": 0.644, + "step": 60000 + }, + { + "epoch": 1.41, + "learning_rate": 4.5490775596552904e-05, + "loss": 0.6537, + "step": 60500 + }, + { + "epoch": 1.42, + "learning_rate": 4.5451886880502756e-05, + "loss": 0.6529, + "step": 61000 + }, + { + "epoch": 1.43, + "learning_rate": 4.5413075941884705e-05, + "loss": 0.6547, + "step": 61500 + }, + { + "epoch": 1.44, + "learning_rate": 4.537418722583455e-05, + "loss": 0.6443, + "step": 62000 + }, + { + "epoch": 1.45, + "learning_rate": 4.53352985097844e-05, + "loss": 0.6516, + "step": 62500 + }, + { + "epoch": 1.46, + "learning_rate": 4.529640979373425e-05, + "loss": 0.6439, + "step": 63000 + }, + { + "epoch": 1.48, + "learning_rate": 4.52575210776841e-05, + "loss": 0.6401, + "step": 63500 + }, + { + "epoch": 1.49, + "learning_rate": 4.521863236163395e-05, + "loss": 0.6545, + "step": 64000 + }, + { + "epoch": 1.5, + "learning_rate": 4.51797436455838e-05, + "loss": 0.6433, + "step": 64500 + }, + { + "epoch": 1.51, + "learning_rate": 4.514085492953365e-05, + "loss": 0.6538, + "step": 65000 + }, + { + "epoch": 1.52, + "learning_rate": 4.51019662134835e-05, + "loss": 0.6548, + "step": 65500 + }, + { + "epoch": 1.53, + "learning_rate": 4.506307749743335e-05, + "loss": 0.6458, + "step": 66000 + }, + { + "epoch": 1.55, + "learning_rate": 4.50241887813832e-05, + "loss": 0.649, + "step": 66500 + }, + { + "epoch": 1.56, + "learning_rate": 4.4985300065333045e-05, + "loss": 0.6471, + "step": 67000 + }, + { + "epoch": 1.57, + "learning_rate": 4.4946411349282896e-05, + "loss": 0.6526, + "step": 67500 + }, + { + "epoch": 1.58, + "learning_rate": 4.4907600410664846e-05, + "loss": 0.646, + "step": 68000 + }, + { + "epoch": 1.59, + "learning_rate": 4.486878947204679e-05, + "loss": 0.6491, + "step": 68500 + }, + { + "epoch": 1.6, + "learning_rate": 4.482990075599664e-05, + "loss": 0.6484, + "step": 69000 + }, + { + "epoch": 1.62, + "learning_rate": 4.479101203994649e-05, + "loss": 0.6421, + "step": 69500 + }, + { + "epoch": 1.63, + "learning_rate": 4.475220110132844e-05, + "loss": 0.6498, + "step": 70000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4713312385278286e-05, + "loss": 0.6497, + "step": 70500 + }, + { + "epoch": 1.65, + "learning_rate": 4.467442366922814e-05, + "loss": 0.6503, + "step": 71000 + }, + { + "epoch": 1.66, + "learning_rate": 4.463553495317799e-05, + "loss": 0.6314, + "step": 71500 + }, + { + "epoch": 1.67, + "learning_rate": 4.459672401455994e-05, + "loss": 0.6495, + "step": 72000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4557835298509784e-05, + "loss": 0.643, + "step": 72500 + }, + { + "epoch": 1.7, + "learning_rate": 4.4518946582459635e-05, + "loss": 0.6542, + "step": 73000 + }, + { + "epoch": 1.71, + "learning_rate": 4.448005786640949e-05, + "loss": 0.6522, + "step": 73500 + }, + { + "epoch": 1.72, + "learning_rate": 4.444116915035934e-05, + "loss": 0.6476, + "step": 74000 + }, + { + "epoch": 1.73, + "learning_rate": 4.440228043430918e-05, + "loss": 0.6337, + "step": 74500 + }, + { + "epoch": 1.74, + "learning_rate": 4.4363391718259035e-05, + "loss": 0.6546, + "step": 75000 + }, + { + "epoch": 1.75, + "learning_rate": 4.4324503002208886e-05, + "loss": 0.647, + "step": 75500 + }, + { + "epoch": 1.77, + "learning_rate": 4.4285614286158724e-05, + "loss": 0.6467, + "step": 76000 + }, + { + "epoch": 1.78, + "learning_rate": 4.4246725570108576e-05, + "loss": 0.6432, + "step": 76500 + }, + { + "epoch": 1.79, + "learning_rate": 4.420783685405843e-05, + "loss": 0.6399, + "step": 77000 + }, + { + "epoch": 1.8, + "learning_rate": 4.416894813800828e-05, + "loss": 0.6382, + "step": 77500 + }, + { + "epoch": 1.81, + "learning_rate": 4.4130059421958123e-05, + "loss": 0.649, + "step": 78000 + }, + { + "epoch": 1.82, + "learning_rate": 4.409124848334007e-05, + "loss": 0.6385, + "step": 78500 + }, + { + "epoch": 1.84, + "learning_rate": 4.4052359767289925e-05, + "loss": 0.6448, + "step": 79000 + }, + { + "epoch": 1.85, + "learning_rate": 4.4013471051239776e-05, + "loss": 0.638, + "step": 79500 + }, + { + "epoch": 1.86, + "learning_rate": 4.397458233518962e-05, + "loss": 0.6317, + "step": 80000 + }, + { + "epoch": 1.87, + "learning_rate": 4.393569361913947e-05, + "loss": 0.6338, + "step": 80500 + }, + { + "epoch": 1.88, + "learning_rate": 4.3896804903089324e-05, + "loss": 0.6406, + "step": 81000 + }, + { + "epoch": 1.89, + "learning_rate": 4.385791618703917e-05, + "loss": 0.6363, + "step": 81500 + }, + { + "epoch": 1.91, + "learning_rate": 4.381902747098902e-05, + "loss": 0.6381, + "step": 82000 + }, + { + "epoch": 1.92, + "learning_rate": 4.378021653237097e-05, + "loss": 0.6383, + "step": 82500 + }, + { + "epoch": 1.93, + "learning_rate": 4.374140559375292e-05, + "loss": 0.6351, + "step": 83000 + }, + { + "epoch": 1.94, + "learning_rate": 4.370251687770277e-05, + "loss": 0.642, + "step": 83500 + }, + { + "epoch": 1.95, + "learning_rate": 4.3663705939084714e-05, + "loss": 0.6399, + "step": 84000 + }, + { + "epoch": 1.96, + "learning_rate": 4.3624817223034566e-05, + "loss": 0.6351, + "step": 84500 + }, + { + "epoch": 1.98, + "learning_rate": 4.358592850698442e-05, + "loss": 0.631, + "step": 85000 + }, + { + "epoch": 1.99, + "learning_rate": 4.354703979093426e-05, + "loss": 0.6298, + "step": 85500 + }, + { + "epoch": 2.0, + "learning_rate": 4.350822885231621e-05, + "loss": 0.6349, + "step": 86000 + }, + { + "epoch": 2.0, + "eval_bleu": 58.202, + "eval_gen_len": 15.9466, + "eval_loss": 0.6546894311904907, + "eval_runtime": 6844.8665, + "eval_samples_per_second": 12.552, + "eval_steps_per_second": 1.569, + "step": 86048 + }, + { + "epoch": 2.01, + "learning_rate": 4.346934013626606e-05, + "loss": 0.5093, + "step": 86500 + }, + { + "epoch": 2.02, + "learning_rate": 4.3430451420215914e-05, + "loss": 0.5066, + "step": 87000 + }, + { + "epoch": 2.03, + "learning_rate": 4.339156270416576e-05, + "loss": 0.4956, + "step": 87500 + }, + { + "epoch": 2.05, + "learning_rate": 4.335267398811561e-05, + "loss": 0.5, + "step": 88000 + }, + { + "epoch": 2.06, + "learning_rate": 4.331378527206546e-05, + "loss": 0.5034, + "step": 88500 + }, + { + "epoch": 2.07, + "learning_rate": 4.327489655601531e-05, + "loss": 0.505, + "step": 89000 + }, + { + "epoch": 2.08, + "learning_rate": 4.323600783996516e-05, + "loss": 0.4995, + "step": 89500 + }, + { + "epoch": 2.09, + "learning_rate": 4.319719690134711e-05, + "loss": 0.5064, + "step": 90000 + }, + { + "epoch": 2.1, + "learning_rate": 4.315830818529696e-05, + "loss": 0.5065, + "step": 90500 + }, + { + "epoch": 2.12, + "learning_rate": 4.311949724667891e-05, + "loss": 0.5031, + "step": 91000 + }, + { + "epoch": 2.13, + "learning_rate": 4.3080608530628754e-05, + "loss": 0.4967, + "step": 91500 + }, + { + "epoch": 2.14, + "learning_rate": 4.30417198145786e-05, + "loss": 0.5112, + "step": 92000 + }, + { + "epoch": 2.15, + "learning_rate": 4.300290887596055e-05, + "loss": 0.5069, + "step": 92500 + }, + { + "epoch": 2.16, + "learning_rate": 4.29640201599104e-05, + "loss": 0.5106, + "step": 93000 + }, + { + "epoch": 2.17, + "learning_rate": 4.292513144386025e-05, + "loss": 0.5077, + "step": 93500 + }, + { + "epoch": 2.18, + "learning_rate": 4.28862427278101e-05, + "loss": 0.5179, + "step": 94000 + }, + { + "epoch": 2.2, + "learning_rate": 4.284735401175995e-05, + "loss": 0.5192, + "step": 94500 + }, + { + "epoch": 2.21, + "learning_rate": 4.28084652957098e-05, + "loss": 0.5103, + "step": 95000 + }, + { + "epoch": 2.22, + "learning_rate": 4.276957657965965e-05, + "loss": 0.5203, + "step": 95500 + }, + { + "epoch": 2.23, + "learning_rate": 4.2730687863609496e-05, + "loss": 0.5179, + "step": 96000 + }, + { + "epoch": 2.24, + "learning_rate": 4.269179914755935e-05, + "loss": 0.515, + "step": 96500 + }, + { + "epoch": 2.25, + "learning_rate": 4.26529104315092e-05, + "loss": 0.5133, + "step": 97000 + }, + { + "epoch": 2.27, + "learning_rate": 4.2614021715459043e-05, + "loss": 0.5147, + "step": 97500 + }, + { + "epoch": 2.28, + "learning_rate": 4.2575132999408895e-05, + "loss": 0.5308, + "step": 98000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2536322060790845e-05, + "loss": 0.5207, + "step": 98500 + }, + { + "epoch": 2.3, + "learning_rate": 4.2497511122172794e-05, + "loss": 0.5228, + "step": 99000 + }, + { + "epoch": 2.31, + "learning_rate": 4.245862240612264e-05, + "loss": 0.5207, + "step": 99500 + }, + { + "epoch": 2.32, + "learning_rate": 4.241973369007249e-05, + "loss": 0.5211, + "step": 100000 + }, + { + "epoch": 2.34, + "learning_rate": 4.2380844974022335e-05, + "loss": 0.5169, + "step": 100500 + }, + { + "epoch": 2.35, + "learning_rate": 4.234195625797219e-05, + "loss": 0.5302, + "step": 101000 + }, + { + "epoch": 2.36, + "learning_rate": 4.230306754192204e-05, + "loss": 0.5155, + "step": 101500 + }, + { + "epoch": 2.37, + "learning_rate": 4.226417882587188e-05, + "loss": 0.5193, + "step": 102000 + }, + { + "epoch": 2.38, + "learning_rate": 4.2225290109821735e-05, + "loss": 0.518, + "step": 102500 + }, + { + "epoch": 2.39, + "learning_rate": 4.2186401393771586e-05, + "loss": 0.5171, + "step": 103000 + }, + { + "epoch": 2.41, + "learning_rate": 4.214751267772144e-05, + "loss": 0.5208, + "step": 103500 + }, + { + "epoch": 2.42, + "learning_rate": 4.210862396167128e-05, + "loss": 0.5231, + "step": 104000 + }, + { + "epoch": 2.43, + "learning_rate": 4.2069735245621134e-05, + "loss": 0.5267, + "step": 104500 + }, + { + "epoch": 2.44, + "learning_rate": 4.2030846529570985e-05, + "loss": 0.5196, + "step": 105000 + }, + { + "epoch": 2.45, + "learning_rate": 4.199195781352083e-05, + "loss": 0.5234, + "step": 105500 + }, + { + "epoch": 2.46, + "learning_rate": 4.1953224652334885e-05, + "loss": 0.526, + "step": 106000 + }, + { + "epoch": 2.48, + "learning_rate": 4.191449149114893e-05, + "loss": 0.5312, + "step": 106500 + }, + { + "epoch": 2.49, + "learning_rate": 4.187560277509878e-05, + "loss": 0.5286, + "step": 107000 + }, + { + "epoch": 2.5, + "learning_rate": 4.183671405904863e-05, + "loss": 0.5225, + "step": 107500 + }, + { + "epoch": 2.51, + "learning_rate": 4.1797825342998474e-05, + "loss": 0.5207, + "step": 108000 + }, + { + "epoch": 2.52, + "learning_rate": 4.1758936626948325e-05, + "loss": 0.5263, + "step": 108500 + }, + { + "epoch": 2.53, + "learning_rate": 4.172004791089818e-05, + "loss": 0.5243, + "step": 109000 + }, + { + "epoch": 2.55, + "learning_rate": 4.1681236972280126e-05, + "loss": 0.5332, + "step": 109500 + }, + { + "epoch": 2.56, + "learning_rate": 4.164234825622997e-05, + "loss": 0.5297, + "step": 110000 + }, + { + "epoch": 2.57, + "learning_rate": 4.160345954017982e-05, + "loss": 0.5274, + "step": 110500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1564570824129674e-05, + "loss": 0.5241, + "step": 111000 + }, + { + "epoch": 2.59, + "learning_rate": 4.152568210807952e-05, + "loss": 0.528, + "step": 111500 + }, + { + "epoch": 2.6, + "learning_rate": 4.148679339202937e-05, + "loss": 0.53, + "step": 112000 + }, + { + "epoch": 2.61, + "learning_rate": 4.144790467597922e-05, + "loss": 0.5291, + "step": 112500 + }, + { + "epoch": 2.63, + "learning_rate": 4.1409015959929073e-05, + "loss": 0.5431, + "step": 113000 + }, + { + "epoch": 2.64, + "learning_rate": 4.137012724387892e-05, + "loss": 0.5336, + "step": 113500 + }, + { + "epoch": 2.65, + "learning_rate": 4.133123852782877e-05, + "loss": 0.531, + "step": 114000 + }, + { + "epoch": 2.66, + "learning_rate": 4.129234981177862e-05, + "loss": 0.5278, + "step": 114500 + }, + { + "epoch": 2.67, + "learning_rate": 4.1253461095728466e-05, + "loss": 0.5214, + "step": 115000 + }, + { + "epoch": 2.68, + "learning_rate": 4.121457237967831e-05, + "loss": 0.5249, + "step": 115500 + }, + { + "epoch": 2.7, + "learning_rate": 4.117568366362816e-05, + "loss": 0.5351, + "step": 116000 + }, + { + "epoch": 2.71, + "learning_rate": 4.1136794947578014e-05, + "loss": 0.5275, + "step": 116500 + }, + { + "epoch": 2.72, + "learning_rate": 4.109790623152786e-05, + "loss": 0.5309, + "step": 117000 + }, + { + "epoch": 2.73, + "learning_rate": 4.105901751547771e-05, + "loss": 0.5291, + "step": 117500 + }, + { + "epoch": 2.74, + "learning_rate": 4.102012879942756e-05, + "loss": 0.5274, + "step": 118000 + }, + { + "epoch": 2.75, + "learning_rate": 4.098131786080951e-05, + "loss": 0.532, + "step": 118500 + }, + { + "epoch": 2.77, + "learning_rate": 4.0942429144759356e-05, + "loss": 0.5278, + "step": 119000 + }, + { + "epoch": 2.78, + "learning_rate": 4.090354042870921e-05, + "loss": 0.5338, + "step": 119500 + }, + { + "epoch": 2.79, + "learning_rate": 4.086465171265906e-05, + "loss": 0.5289, + "step": 120000 + }, + { + "epoch": 2.8, + "learning_rate": 4.082576299660891e-05, + "loss": 0.5388, + "step": 120500 + }, + { + "epoch": 2.81, + "learning_rate": 4.0786874280558755e-05, + "loss": 0.5337, + "step": 121000 + }, + { + "epoch": 2.82, + "learning_rate": 4.074798556450861e-05, + "loss": 0.5308, + "step": 121500 + }, + { + "epoch": 2.84, + "learning_rate": 4.0709174625890556e-05, + "loss": 0.5294, + "step": 122000 + }, + { + "epoch": 2.85, + "learning_rate": 4.067028590984041e-05, + "loss": 0.5385, + "step": 122500 + }, + { + "epoch": 2.86, + "learning_rate": 4.063139719379025e-05, + "loss": 0.5379, + "step": 123000 + }, + { + "epoch": 2.87, + "learning_rate": 4.05925084777401e-05, + "loss": 0.5485, + "step": 123500 + }, + { + "epoch": 2.88, + "learning_rate": 4.055361976168995e-05, + "loss": 0.5391, + "step": 124000 + }, + { + "epoch": 2.89, + "learning_rate": 4.05147310456398e-05, + "loss": 0.5414, + "step": 124500 + }, + { + "epoch": 2.91, + "learning_rate": 4.0475842329589645e-05, + "loss": 0.5237, + "step": 125000 + }, + { + "epoch": 2.92, + "learning_rate": 4.04369536135395e-05, + "loss": 0.5363, + "step": 125500 + }, + { + "epoch": 2.93, + "learning_rate": 4.039806489748935e-05, + "loss": 0.543, + "step": 126000 + }, + { + "epoch": 2.94, + "learning_rate": 4.0359331736303396e-05, + "loss": 0.5265, + "step": 126500 + }, + { + "epoch": 2.95, + "learning_rate": 4.032044302025325e-05, + "loss": 0.5379, + "step": 127000 + }, + { + "epoch": 2.96, + "learning_rate": 4.028155430420309e-05, + "loss": 0.5311, + "step": 127500 + }, + { + "epoch": 2.98, + "learning_rate": 4.0242665588152944e-05, + "loss": 0.5361, + "step": 128000 + }, + { + "epoch": 2.99, + "learning_rate": 4.0203776872102795e-05, + "loss": 0.5405, + "step": 128500 + }, + { + "epoch": 3.0, + "learning_rate": 4.016488815605265e-05, + "loss": 0.537, + "step": 129000 + }, + { + "epoch": 3.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6645.2011, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616, + "step": 129072 + }, + { + "epoch": 3.01, + "learning_rate": 4.0126077217434597e-05, + "loss": 0.4217, + "step": 129500 + }, + { + "epoch": 3.02, + "learning_rate": 4.008718850138444e-05, + "loss": 0.3942, + "step": 130000 + }, + { + "epoch": 3.03, + "learning_rate": 4.0048299785334286e-05, + "loss": 0.3948, + "step": 130500 + }, + { + "epoch": 3.04, + "learning_rate": 4.0009488846716236e-05, + "loss": 0.3895, + "step": 131000 + }, + { + "epoch": 3.06, + "learning_rate": 3.9970677908098185e-05, + "loss": 0.3912, + "step": 131500 + }, + { + "epoch": 3.07, + "learning_rate": 3.993178919204804e-05, + "loss": 0.3989, + "step": 132000 + }, + { + "epoch": 3.08, + "learning_rate": 3.989290047599789e-05, + "loss": 0.3975, + "step": 132500 + }, + { + "epoch": 3.09, + "learning_rate": 3.985401175994773e-05, + "loss": 0.4036, + "step": 133000 + }, + { + "epoch": 3.1, + "learning_rate": 3.9815123043897585e-05, + "loss": 0.4028, + "step": 133500 + }, + { + "epoch": 3.11, + "learning_rate": 3.9776234327847436e-05, + "loss": 0.3985, + "step": 134000 + }, + { + "epoch": 3.13, + "learning_rate": 3.973734561179728e-05, + "loss": 0.4002, + "step": 134500 + }, + { + "epoch": 3.14, + "learning_rate": 3.969845689574713e-05, + "loss": 0.3975, + "step": 135000 + }, + { + "epoch": 3.15, + "learning_rate": 3.9659568179696984e-05, + "loss": 0.4024, + "step": 135500 + }, + { + "epoch": 3.16, + "learning_rate": 3.962067946364683e-05, + "loss": 0.4016, + "step": 136000 + }, + { + "epoch": 3.17, + "learning_rate": 3.958179074759668e-05, + "loss": 0.4084, + "step": 136500 + }, + { + "epoch": 3.18, + "learning_rate": 3.954290203154653e-05, + "loss": 0.4054, + "step": 137000 + }, + { + "epoch": 3.2, + "learning_rate": 3.9504013315496377e-05, + "loss": 0.4061, + "step": 137500 + }, + { + "epoch": 3.21, + "learning_rate": 3.946512459944623e-05, + "loss": 0.4098, + "step": 138000 + }, + { + "epoch": 3.22, + "learning_rate": 3.942623588339607e-05, + "loss": 0.4115, + "step": 138500 + }, + { + "epoch": 3.23, + "learning_rate": 3.9387347167345924e-05, + "loss": 0.4068, + "step": 139000 + }, + { + "epoch": 3.24, + "learning_rate": 3.9348536228727874e-05, + "loss": 0.4058, + "step": 139500 + }, + { + "epoch": 3.25, + "learning_rate": 3.930964751267772e-05, + "loss": 0.4059, + "step": 140000 + }, + { + "epoch": 3.27, + "learning_rate": 3.927075879662757e-05, + "loss": 0.4145, + "step": 140500 + }, + { + "epoch": 3.28, + "learning_rate": 3.9232025635441625e-05, + "loss": 0.4104, + "step": 141000 + }, + { + "epoch": 3.29, + "learning_rate": 3.919313691939147e-05, + "loss": 0.4141, + "step": 141500 + }, + { + "epoch": 3.3, + "learning_rate": 3.915424820334132e-05, + "loss": 0.4158, + "step": 142000 + }, + { + "epoch": 3.31, + "learning_rate": 3.911535948729117e-05, + "loss": 0.4115, + "step": 142500 + }, + { + "epoch": 3.32, + "learning_rate": 3.907647077124102e-05, + "loss": 0.4197, + "step": 143000 + }, + { + "epoch": 3.34, + "learning_rate": 3.903758205519087e-05, + "loss": 0.4082, + "step": 143500 + }, + { + "epoch": 3.35, + "learning_rate": 3.899869333914072e-05, + "loss": 0.4231, + "step": 144000 + }, + { + "epoch": 3.36, + "learning_rate": 3.8959804623090565e-05, + "loss": 0.4237, + "step": 144500 + }, + { + "epoch": 3.37, + "learning_rate": 3.892091590704042e-05, + "loss": 0.4162, + "step": 145000 + }, + { + "epoch": 3.38, + "learning_rate": 3.888202719099027e-05, + "loss": 0.4189, + "step": 145500 + }, + { + "epoch": 3.39, + "learning_rate": 3.884321625237221e-05, + "loss": 0.4154, + "step": 146000 + }, + { + "epoch": 3.41, + "learning_rate": 3.880432753632206e-05, + "loss": 0.4193, + "step": 146500 + }, + { + "epoch": 3.42, + "learning_rate": 3.876543882027191e-05, + "loss": 0.4238, + "step": 147000 + }, + { + "epoch": 3.43, + "learning_rate": 3.872655010422176e-05, + "loss": 0.4274, + "step": 147500 + }, + { + "epoch": 3.44, + "learning_rate": 3.868766138817161e-05, + "loss": 0.4165, + "step": 148000 + }, + { + "epoch": 3.45, + "learning_rate": 3.8648772672121455e-05, + "loss": 0.4223, + "step": 148500 + }, + { + "epoch": 3.46, + "learning_rate": 3.860988395607131e-05, + "loss": 0.4213, + "step": 149000 + }, + { + "epoch": 3.47, + "learning_rate": 3.8571073017453256e-05, + "loss": 0.422, + "step": 149500 + }, + { + "epoch": 3.49, + "learning_rate": 3.853218430140311e-05, + "loss": 0.4245, + "step": 150000 + }, + { + "epoch": 3.5, + "learning_rate": 3.849329558535296e-05, + "loss": 0.4247, + "step": 150500 + }, + { + "epoch": 3.51, + "learning_rate": 3.8454406869302804e-05, + "loss": 0.4267, + "step": 151000 + }, + { + "epoch": 3.52, + "learning_rate": 3.8415518153252656e-05, + "loss": 0.4253, + "step": 151500 + }, + { + "epoch": 3.53, + "learning_rate": 3.837662943720251e-05, + "loss": 0.4311, + "step": 152000 + }, + { + "epoch": 3.54, + "learning_rate": 3.833774072115235e-05, + "loss": 0.4204, + "step": 152500 + }, + { + "epoch": 3.56, + "learning_rate": 3.8298852005102203e-05, + "loss": 0.4246, + "step": 153000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8259963289052055e-05, + "loss": 0.4424, + "step": 153500 + }, + { + "epoch": 3.58, + "learning_rate": 3.82210745730019e-05, + "loss": 0.426, + "step": 154000 + }, + { + "epoch": 3.59, + "learning_rate": 3.8182185856951744e-05, + "loss": 0.4304, + "step": 154500 + }, + { + "epoch": 3.6, + "learning_rate": 3.8143374918333694e-05, + "loss": 0.4292, + "step": 155000 + }, + { + "epoch": 3.61, + "learning_rate": 3.8104486202283546e-05, + "loss": 0.426, + "step": 155500 + }, + { + "epoch": 3.63, + "learning_rate": 3.80655974862334e-05, + "loss": 0.4247, + "step": 156000 + }, + { + "epoch": 3.64, + "learning_rate": 3.802670877018324e-05, + "loss": 0.4271, + "step": 156500 + }, + { + "epoch": 3.65, + "learning_rate": 3.798789783156519e-05, + "loss": 0.4279, + "step": 157000 + }, + { + "epoch": 3.66, + "learning_rate": 3.794900911551504e-05, + "loss": 0.4357, + "step": 157500 + }, + { + "epoch": 3.67, + "learning_rate": 3.791019817689699e-05, + "loss": 0.435, + "step": 158000 + }, + { + "epoch": 3.68, + "learning_rate": 3.7871309460846844e-05, + "loss": 0.4336, + "step": 158500 + }, + { + "epoch": 3.7, + "learning_rate": 3.7832420744796696e-05, + "loss": 0.4303, + "step": 159000 + }, + { + "epoch": 3.71, + "learning_rate": 3.779353202874654e-05, + "loss": 0.4231, + "step": 159500 + }, + { + "epoch": 3.72, + "learning_rate": 3.775464331269639e-05, + "loss": 0.4303, + "step": 160000 + }, + { + "epoch": 3.73, + "learning_rate": 3.7715754596646244e-05, + "loss": 0.4301, + "step": 160500 + }, + { + "epoch": 3.74, + "learning_rate": 3.767686588059609e-05, + "loss": 0.4317, + "step": 161000 + }, + { + "epoch": 3.75, + "learning_rate": 3.763797716454593e-05, + "loss": 0.4421, + "step": 161500 + }, + { + "epoch": 3.77, + "learning_rate": 3.7599088448495785e-05, + "loss": 0.4338, + "step": 162000 + }, + { + "epoch": 3.78, + "learning_rate": 3.7560199732445636e-05, + "loss": 0.4315, + "step": 162500 + }, + { + "epoch": 3.79, + "learning_rate": 3.752131101639548e-05, + "loss": 0.4318, + "step": 163000 + }, + { + "epoch": 3.8, + "learning_rate": 3.748242230034533e-05, + "loss": 0.4378, + "step": 163500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7443533584295184e-05, + "loss": 0.438, + "step": 164000 + }, + { + "epoch": 3.82, + "learning_rate": 3.740464486824503e-05, + "loss": 0.4385, + "step": 164500 + }, + { + "epoch": 3.84, + "learning_rate": 3.736575615219488e-05, + "loss": 0.4365, + "step": 165000 + }, + { + "epoch": 3.85, + "learning_rate": 3.732686743614473e-05, + "loss": 0.4286, + "step": 165500 + }, + { + "epoch": 3.86, + "learning_rate": 3.7287978720094576e-05, + "loss": 0.4369, + "step": 166000 + }, + { + "epoch": 3.87, + "learning_rate": 3.7249167781476526e-05, + "loss": 0.4361, + "step": 166500 + }, + { + "epoch": 3.88, + "learning_rate": 3.721027906542638e-05, + "loss": 0.4371, + "step": 167000 + }, + { + "epoch": 3.89, + "learning_rate": 3.717139034937623e-05, + "loss": 0.4351, + "step": 167500 + }, + { + "epoch": 3.9, + "learning_rate": 3.713257941075818e-05, + "loss": 0.4361, + "step": 168000 + }, + { + "epoch": 3.92, + "learning_rate": 3.709376847214013e-05, + "loss": 0.434, + "step": 168500 + }, + { + "epoch": 3.93, + "learning_rate": 3.705487975608998e-05, + "loss": 0.4408, + "step": 169000 + }, + { + "epoch": 3.94, + "learning_rate": 3.701606881747192e-05, + "loss": 0.4398, + "step": 169500 + }, + { + "epoch": 3.95, + "learning_rate": 3.6977180101421774e-05, + "loss": 0.4396, + "step": 170000 + }, + { + "epoch": 3.96, + "learning_rate": 3.693829138537162e-05, + "loss": 0.438, + "step": 170500 + }, + { + "epoch": 3.97, + "learning_rate": 3.689940266932147e-05, + "loss": 0.4374, + "step": 171000 + }, + { + "epoch": 3.99, + "learning_rate": 3.686051395327132e-05, + "loss": 0.4407, + "step": 171500 + }, + { + "epoch": 4.0, + "learning_rate": 3.682162523722117e-05, + "loss": 0.434, + "step": 172000 + }, + { + "epoch": 4.0, + "eval_bleu": 59.6194, + "eval_gen_len": 15.702, + "eval_loss": 0.6589247584342957, + "eval_runtime": 6589.9687, + "eval_samples_per_second": 13.037, + "eval_steps_per_second": 1.63, + "step": 172096 + }, + { + "epoch": 4.01, + "learning_rate": 3.678273652117102e-05, + "loss": 0.3325, + "step": 172500 + }, + { + "epoch": 4.02, + "learning_rate": 3.674384780512087e-05, + "loss": 0.3007, + "step": 173000 + }, + { + "epoch": 4.03, + "learning_rate": 3.6704959089070715e-05, + "loss": 0.305, + "step": 173500 + }, + { + "epoch": 4.04, + "learning_rate": 3.6666070373020566e-05, + "loss": 0.3016, + "step": 174000 + }, + { + "epoch": 4.06, + "learning_rate": 3.662718165697042e-05, + "loss": 0.2961, + "step": 174500 + }, + { + "epoch": 4.07, + "learning_rate": 3.6588526273216564e-05, + "loss": 0.3007, + "step": 175000 + }, + { + "epoch": 4.08, + "learning_rate": 3.6549637557166415e-05, + "loss": 0.3012, + "step": 175500 + }, + { + "epoch": 4.09, + "learning_rate": 3.651074884111627e-05, + "loss": 0.3094, + "step": 176000 + }, + { + "epoch": 4.1, + "learning_rate": 3.647186012506612e-05, + "loss": 0.3069, + "step": 176500 + }, + { + "epoch": 4.11, + "learning_rate": 3.6432971409015956e-05, + "loss": 0.3091, + "step": 177000 + }, + { + "epoch": 4.13, + "learning_rate": 3.639408269296581e-05, + "loss": 0.3147, + "step": 177500 + }, + { + "epoch": 4.14, + "learning_rate": 3.635519397691566e-05, + "loss": 0.3112, + "step": 178000 + }, + { + "epoch": 4.15, + "learning_rate": 3.6316305260865504e-05, + "loss": 0.3129, + "step": 178500 + }, + { + "epoch": 4.16, + "learning_rate": 3.6277416544815356e-05, + "loss": 0.309, + "step": 179000 + }, + { + "epoch": 4.17, + "learning_rate": 3.623852782876521e-05, + "loss": 0.3195, + "step": 179500 + }, + { + "epoch": 4.18, + "learning_rate": 3.619963911271506e-05, + "loss": 0.3219, + "step": 180000 + }, + { + "epoch": 4.2, + "learning_rate": 3.6160750396664903e-05, + "loss": 0.3134, + "step": 180500 + }, + { + "epoch": 4.21, + "learning_rate": 3.6121861680614755e-05, + "loss": 0.3182, + "step": 181000 + }, + { + "epoch": 4.22, + "learning_rate": 3.6083050741996705e-05, + "loss": 0.3131, + "step": 181500 + }, + { + "epoch": 4.23, + "learning_rate": 3.6044162025946556e-05, + "loss": 0.3193, + "step": 182000 + }, + { + "epoch": 4.24, + "learning_rate": 3.6005351087328506e-05, + "loss": 0.3153, + "step": 182500 + }, + { + "epoch": 4.25, + "learning_rate": 3.596646237127835e-05, + "loss": 0.3264, + "step": 183000 + }, + { + "epoch": 4.27, + "learning_rate": 3.59276514326603e-05, + "loss": 0.3195, + "step": 183500 + }, + { + "epoch": 4.28, + "learning_rate": 3.588876271661015e-05, + "loss": 0.3197, + "step": 184000 + }, + { + "epoch": 4.29, + "learning_rate": 3.5849874000559996e-05, + "loss": 0.3235, + "step": 184500 + }, + { + "epoch": 4.3, + "learning_rate": 3.581098528450985e-05, + "loss": 0.3188, + "step": 185000 + }, + { + "epoch": 4.31, + "learning_rate": 3.577209656845969e-05, + "loss": 0.3183, + "step": 185500 + }, + { + "epoch": 4.32, + "learning_rate": 3.5733207852409544e-05, + "loss": 0.3271, + "step": 186000 + }, + { + "epoch": 4.33, + "learning_rate": 3.5694319136359396e-05, + "loss": 0.3333, + "step": 186500 + }, + { + "epoch": 4.35, + "learning_rate": 3.565543042030924e-05, + "loss": 0.3216, + "step": 187000 + }, + { + "epoch": 4.36, + "learning_rate": 3.561654170425909e-05, + "loss": 0.3296, + "step": 187500 + }, + { + "epoch": 4.37, + "learning_rate": 3.5577652988208944e-05, + "loss": 0.3282, + "step": 188000 + }, + { + "epoch": 4.38, + "learning_rate": 3.5538764272158795e-05, + "loss": 0.3297, + "step": 188500 + }, + { + "epoch": 4.39, + "learning_rate": 3.549987555610864e-05, + "loss": 0.3283, + "step": 189000 + }, + { + "epoch": 4.4, + "learning_rate": 3.546106461749059e-05, + "loss": 0.3294, + "step": 189500 + }, + { + "epoch": 4.42, + "learning_rate": 3.542217590144044e-05, + "loss": 0.3275, + "step": 190000 + }, + { + "epoch": 4.43, + "learning_rate": 3.538328718539029e-05, + "loss": 0.324, + "step": 190500 + }, + { + "epoch": 4.44, + "learning_rate": 3.534439846934014e-05, + "loss": 0.3285, + "step": 191000 + }, + { + "epoch": 4.45, + "learning_rate": 3.530558753072209e-05, + "loss": 0.3338, + "step": 191500 + }, + { + "epoch": 4.46, + "learning_rate": 3.526669881467194e-05, + "loss": 0.3347, + "step": 192000 + }, + { + "epoch": 4.47, + "learning_rate": 3.522781009862178e-05, + "loss": 0.338, + "step": 192500 + }, + { + "epoch": 4.49, + "learning_rate": 3.5188921382571635e-05, + "loss": 0.3312, + "step": 193000 + }, + { + "epoch": 4.5, + "learning_rate": 3.515003266652148e-05, + "loss": 0.3364, + "step": 193500 + }, + { + "epoch": 4.51, + "learning_rate": 3.511114395047133e-05, + "loss": 0.3347, + "step": 194000 + }, + { + "epoch": 4.52, + "learning_rate": 3.507225523442118e-05, + "loss": 0.3334, + "step": 194500 + }, + { + "epoch": 4.53, + "learning_rate": 3.503336651837103e-05, + "loss": 0.3319, + "step": 195000 + }, + { + "epoch": 4.54, + "learning_rate": 3.499455557975298e-05, + "loss": 0.3332, + "step": 195500 + }, + { + "epoch": 4.56, + "learning_rate": 3.495566686370283e-05, + "loss": 0.3418, + "step": 196000 + }, + { + "epoch": 4.57, + "learning_rate": 3.491677814765268e-05, + "loss": 0.3402, + "step": 196500 + }, + { + "epoch": 4.58, + "learning_rate": 3.487796720903463e-05, + "loss": 0.3342, + "step": 197000 + }, + { + "epoch": 4.59, + "learning_rate": 3.483907849298448e-05, + "loss": 0.3294, + "step": 197500 + }, + { + "epoch": 4.6, + "learning_rate": 3.4800189776934326e-05, + "loss": 0.3393, + "step": 198000 + }, + { + "epoch": 4.61, + "learning_rate": 3.476130106088418e-05, + "loss": 0.3446, + "step": 198500 + }, + { + "epoch": 4.63, + "learning_rate": 3.472241234483403e-05, + "loss": 0.3375, + "step": 199000 + }, + { + "epoch": 4.64, + "learning_rate": 3.4683523628783874e-05, + "loss": 0.3399, + "step": 199500 + }, + { + "epoch": 4.65, + "learning_rate": 3.4644634912733725e-05, + "loss": 0.3405, + "step": 200000 + }, + { + "epoch": 4.66, + "learning_rate": 3.460574619668357e-05, + "loss": 0.3378, + "step": 200500 + }, + { + "epoch": 4.67, + "learning_rate": 3.456693525806552e-05, + "loss": 0.346, + "step": 201000 + }, + { + "epoch": 4.68, + "learning_rate": 3.452804654201537e-05, + "loss": 0.3455, + "step": 201500 + }, + { + "epoch": 4.7, + "learning_rate": 3.4489157825965216e-05, + "loss": 0.3432, + "step": 202000 + }, + { + "epoch": 4.71, + "learning_rate": 3.4450346887347166e-05, + "loss": 0.3505, + "step": 202500 + }, + { + "epoch": 4.72, + "learning_rate": 3.441145817129702e-05, + "loss": 0.3432, + "step": 203000 + }, + { + "epoch": 4.73, + "learning_rate": 3.437256945524687e-05, + "loss": 0.3405, + "step": 203500 + }, + { + "epoch": 4.74, + "learning_rate": 3.433368073919671e-05, + "loss": 0.3394, + "step": 204000 + }, + { + "epoch": 4.75, + "learning_rate": 3.4294792023146565e-05, + "loss": 0.3477, + "step": 204500 + }, + { + "epoch": 4.76, + "learning_rate": 3.4255903307096416e-05, + "loss": 0.352, + "step": 205000 + }, + { + "epoch": 4.78, + "learning_rate": 3.421701459104627e-05, + "loss": 0.3442, + "step": 205500 + }, + { + "epoch": 4.79, + "learning_rate": 3.417812587499611e-05, + "loss": 0.3487, + "step": 206000 + }, + { + "epoch": 4.8, + "learning_rate": 3.4139237158945964e-05, + "loss": 0.346, + "step": 206500 + }, + { + "epoch": 4.81, + "learning_rate": 3.4100348442895816e-05, + "loss": 0.3533, + "step": 207000 + }, + { + "epoch": 4.82, + "learning_rate": 3.406145972684566e-05, + "loss": 0.3501, + "step": 207500 + }, + { + "epoch": 4.83, + "learning_rate": 3.4022571010795505e-05, + "loss": 0.3472, + "step": 208000 + }, + { + "epoch": 4.85, + "learning_rate": 3.398368229474536e-05, + "loss": 0.351, + "step": 208500 + }, + { + "epoch": 4.86, + "learning_rate": 3.3944871356127306e-05, + "loss": 0.3415, + "step": 209000 + }, + { + "epoch": 4.87, + "learning_rate": 3.390598264007716e-05, + "loss": 0.3478, + "step": 209500 + }, + { + "epoch": 4.88, + "learning_rate": 3.386717170145911e-05, + "loss": 0.3475, + "step": 210000 + }, + { + "epoch": 4.89, + "learning_rate": 3.382828298540895e-05, + "loss": 0.3486, + "step": 210500 + }, + { + "epoch": 4.9, + "learning_rate": 3.3789394269358804e-05, + "loss": 0.3564, + "step": 211000 + }, + { + "epoch": 4.92, + "learning_rate": 3.3750505553308655e-05, + "loss": 0.3484, + "step": 211500 + }, + { + "epoch": 4.93, + "learning_rate": 3.37116168372585e-05, + "loss": 0.3541, + "step": 212000 + }, + { + "epoch": 4.94, + "learning_rate": 3.367280589864045e-05, + "loss": 0.3464, + "step": 212500 + }, + { + "epoch": 4.95, + "learning_rate": 3.36339949600224e-05, + "loss": 0.35, + "step": 213000 + }, + { + "epoch": 4.96, + "learning_rate": 3.359510624397225e-05, + "loss": 0.3475, + "step": 213500 + }, + { + "epoch": 4.97, + "learning_rate": 3.35562175279221e-05, + "loss": 0.3479, + "step": 214000 + }, + { + "epoch": 4.99, + "learning_rate": 3.3517328811871954e-05, + "loss": 0.3556, + "step": 214500 + }, + { + "epoch": 5.0, + "learning_rate": 3.34784400958218e-05, + "loss": 0.3504, + "step": 215000 + }, + { + "epoch": 5.0, + "eval_bleu": 59.352, + "eval_gen_len": 15.7454, + "eval_loss": 0.7117229700088501, + "eval_runtime": 6592.0433, + "eval_samples_per_second": 13.033, + "eval_steps_per_second": 1.629, + "step": 215120 + } + ], + "logging_steps": 500, + "max_steps": 645360, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 7.459087290854277e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-215120/training_args.bin b/checkpoint-215120/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c71a9dade62e1afff1c0282f02a162933d87afd7 --- /dev/null +++ b/checkpoint-215120/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d80f6465473a02dd91018cea5b0675845684097cfa8269a7e746c79607dbe1 +size 4856 diff --git a/checkpoint-258144/config.json b/checkpoint-258144/config.json new file mode 100644 index 0000000000000000000000000000000000000000..77b8612fc6bf9e6aaa7560c8bcce3e4c9c6986de --- /dev/null +++ b/checkpoint-258144/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.37.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/checkpoint-258144/generation_config.json b/checkpoint-258144/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d0b25bc2cc841a32d579c94d80eb6323756b7a --- /dev/null +++ b/checkpoint-258144/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.37.2" +} diff --git a/checkpoint-258144/model.safetensors b/checkpoint-258144/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..394da6ba575f1c8762f6e9dc3bcfb134273b19b8 --- /dev/null +++ b/checkpoint-258144/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ef21573413e6bbc7e44e1f4806f382c99aba4dfac701eb30c08b350afeff274 +size 2444578688 diff --git a/checkpoint-258144/optimizer.pt b/checkpoint-258144/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d61738f2d273fef92f0d59d4b8365571b06d9183 --- /dev/null +++ b/checkpoint-258144/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc24d53d2c2aeef603b08c1ea1cdb1007478254607df26755dcb91b8b161f7b +size 4887473903 diff --git a/checkpoint-258144/rng_state_0.pth b/checkpoint-258144/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..34b7f77f4f3331af337cfdf2426802dde23739b9 --- /dev/null +++ b/checkpoint-258144/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6124adb6cd9ff553c56324f6812f71d53613e82d2048e2459cb8a50bf81bd4bb +size 15024 diff --git a/checkpoint-258144/rng_state_1.pth b/checkpoint-258144/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5dfebdf1cadef62b0d80b7f82e2e557e29bbf479 --- /dev/null +++ b/checkpoint-258144/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e72818b75fd4990182c6aeadc10ff37fce860cac03d62dd7e09b32c5fd68a945 +size 15024 diff --git a/checkpoint-258144/rng_state_2.pth b/checkpoint-258144/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1217f4a7bfca3dabf5542758bbbffcc7f8d34ba2 --- /dev/null +++ b/checkpoint-258144/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7bc62eb28912b7a435a84936388dfb85c4979c82aa4159ab7344380d3ed0bcb +size 15024 diff --git a/checkpoint-258144/rng_state_3.pth b/checkpoint-258144/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..11498698e46b10af74f160dd0fb544b1a6db2743 --- /dev/null +++ b/checkpoint-258144/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5db5e2e2e56df076b9ff1a2259770f014e69778708fcac7ae8d4bc6929b03a4 +size 15024 diff --git a/checkpoint-258144/scheduler.pt b/checkpoint-258144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d25f23eddd11875be0f21e82776c12a6634cd79b --- /dev/null +++ b/checkpoint-258144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ad8b67f4cba7800edbc8c08f7656746fc6c862ec6662bd0f3db409a5e9279f +size 1064 diff --git a/checkpoint-258144/sentencepiece.bpe.model b/checkpoint-258144/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/checkpoint-258144/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/checkpoint-258144/special_tokens_map.json b/checkpoint-258144/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/checkpoint-258144/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/checkpoint-258144/tokenizer.json b/checkpoint-258144/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc6a4f3075bc2a01607c72e81fd24456ab68311 --- /dev/null +++ b/checkpoint-258144/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb9b1f3e7ce9f6c1a5ab4560578eda3329db396be400909c5d34c8d0b08b0ed +size 17110208 diff --git a/checkpoint-258144/tokenizer_config.json b/checkpoint-258144/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70c0515a5815fcc727e11e053116348bfac12128 --- /dev/null +++ b/checkpoint-258144/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "ja_XX", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/checkpoint-258144/trainer_state.json b/checkpoint-258144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6f80d073ce3b22825d0b39181e48f9467f9d0727 --- /dev/null +++ b/checkpoint-258144/trainer_state.json @@ -0,0 +1,3177 @@ +{ + "best_metric": 0.6417234539985657, + "best_model_checkpoint": "./enko_mbartLarge_100p_sup2/checkpoint-129072", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 258144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.940000000000001e-06, + "loss": 2.2901, + "step": 500 + }, + { + "epoch": 0.02, + "learning_rate": 1.992e-05, + "loss": 1.6084, + "step": 1000 + }, + { + "epoch": 0.03, + "learning_rate": 2.9920000000000005e-05, + "loss": 1.449, + "step": 1500 + }, + { + "epoch": 0.05, + "learning_rate": 3.9920000000000004e-05, + "loss": 1.3874, + "step": 2000 + }, + { + "epoch": 0.06, + "learning_rate": 4.992e-05, + "loss": 1.3485, + "step": 2500 + }, + { + "epoch": 0.07, + "learning_rate": 4.996142239367825e-05, + "loss": 1.2926, + "step": 3000 + }, + { + "epoch": 0.08, + "learning_rate": 4.9922533677628105e-05, + "loss": 1.248, + "step": 3500 + }, + { + "epoch": 0.09, + "learning_rate": 4.988364496157795e-05, + "loss": 1.1931, + "step": 4000 + }, + { + "epoch": 0.1, + "learning_rate": 4.98447562455278e-05, + "loss": 1.1677, + "step": 4500 + }, + { + "epoch": 0.12, + "learning_rate": 4.980594530690975e-05, + "loss": 1.142, + "step": 5000 + }, + { + "epoch": 0.13, + "learning_rate": 4.97670565908596e-05, + "loss": 1.122, + "step": 5500 + }, + { + "epoch": 0.14, + "learning_rate": 4.972816787480945e-05, + "loss": 1.0855, + "step": 6000 + }, + { + "epoch": 0.15, + "learning_rate": 4.968927915875929e-05, + "loss": 1.0719, + "step": 6500 + }, + { + "epoch": 0.16, + "learning_rate": 4.965039044270914e-05, + "loss": 1.053, + "step": 7000 + }, + { + "epoch": 0.17, + "learning_rate": 4.9611501726658995e-05, + "loss": 1.0359, + "step": 7500 + }, + { + "epoch": 0.19, + "learning_rate": 4.957261301060884e-05, + "loss": 1.0232, + "step": 8000 + }, + { + "epoch": 0.2, + "learning_rate": 4.953380207199079e-05, + "loss": 1.0169, + "step": 8500 + }, + { + "epoch": 0.21, + "learning_rate": 4.949491335594064e-05, + "loss": 1.0141, + "step": 9000 + }, + { + "epoch": 0.22, + "learning_rate": 4.945602463989049e-05, + "loss": 0.9855, + "step": 9500 + }, + { + "epoch": 0.23, + "learning_rate": 4.941721370127244e-05, + "loss": 0.9932, + "step": 10000 + }, + { + "epoch": 0.24, + "learning_rate": 4.9378324985222293e-05, + "loss": 0.9656, + "step": 10500 + }, + { + "epoch": 0.26, + "learning_rate": 4.933943626917214e-05, + "loss": 0.9694, + "step": 11000 + }, + { + "epoch": 0.27, + "learning_rate": 4.930054755312199e-05, + "loss": 0.9609, + "step": 11500 + }, + { + "epoch": 0.28, + "learning_rate": 4.926165883707184e-05, + "loss": 0.9526, + "step": 12000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9222770121021686e-05, + "loss": 0.9359, + "step": 12500 + }, + { + "epoch": 0.3, + "learning_rate": 4.9183959182403636e-05, + "loss": 0.9324, + "step": 13000 + }, + { + "epoch": 0.31, + "learning_rate": 4.914507046635349e-05, + "loss": 0.9251, + "step": 13500 + }, + { + "epoch": 0.33, + "learning_rate": 4.910625952773543e-05, + "loss": 0.9216, + "step": 14000 + }, + { + "epoch": 0.34, + "learning_rate": 4.906737081168528e-05, + "loss": 0.9181, + "step": 14500 + }, + { + "epoch": 0.35, + "learning_rate": 4.902855987306723e-05, + "loss": 0.9098, + "step": 15000 + }, + { + "epoch": 0.36, + "learning_rate": 4.898967115701708e-05, + "loss": 0.9067, + "step": 15500 + }, + { + "epoch": 0.37, + "learning_rate": 4.895078244096693e-05, + "loss": 0.8931, + "step": 16000 + }, + { + "epoch": 0.38, + "learning_rate": 4.891189372491678e-05, + "loss": 0.8923, + "step": 16500 + }, + { + "epoch": 0.4, + "learning_rate": 4.887300500886663e-05, + "loss": 0.8703, + "step": 17000 + }, + { + "epoch": 0.41, + "learning_rate": 4.8834116292816475e-05, + "loss": 0.8861, + "step": 17500 + }, + { + "epoch": 0.42, + "learning_rate": 4.879522757676633e-05, + "loss": 0.8864, + "step": 18000 + }, + { + "epoch": 0.43, + "learning_rate": 4.875633886071618e-05, + "loss": 0.886, + "step": 18500 + }, + { + "epoch": 0.44, + "learning_rate": 4.871752792209813e-05, + "loss": 0.8737, + "step": 19000 + }, + { + "epoch": 0.45, + "learning_rate": 4.867863920604798e-05, + "loss": 0.8708, + "step": 19500 + }, + { + "epoch": 0.46, + "learning_rate": 4.8639750489997824e-05, + "loss": 0.865, + "step": 20000 + }, + { + "epoch": 0.48, + "learning_rate": 4.8600861773947676e-05, + "loss": 0.8571, + "step": 20500 + }, + { + "epoch": 0.49, + "learning_rate": 4.856197305789753e-05, + "loss": 0.8621, + "step": 21000 + }, + { + "epoch": 0.5, + "learning_rate": 4.852308434184737e-05, + "loss": 0.8607, + "step": 21500 + }, + { + "epoch": 0.51, + "learning_rate": 4.848419562579722e-05, + "loss": 0.8519, + "step": 22000 + }, + { + "epoch": 0.52, + "learning_rate": 4.844530690974707e-05, + "loss": 0.8472, + "step": 22500 + }, + { + "epoch": 0.53, + "learning_rate": 4.840641819369692e-05, + "loss": 0.8381, + "step": 23000 + }, + { + "epoch": 0.55, + "learning_rate": 4.8367529477646765e-05, + "loss": 0.8329, + "step": 23500 + }, + { + "epoch": 0.56, + "learning_rate": 4.8328640761596616e-05, + "loss": 0.8425, + "step": 24000 + }, + { + "epoch": 0.57, + "learning_rate": 4.828975204554647e-05, + "loss": 0.8227, + "step": 24500 + }, + { + "epoch": 0.58, + "learning_rate": 4.825086332949631e-05, + "loss": 0.8349, + "step": 25000 + }, + { + "epoch": 0.59, + "learning_rate": 4.8211974613446164e-05, + "loss": 0.8235, + "step": 25500 + }, + { + "epoch": 0.6, + "learning_rate": 4.8173085897396015e-05, + "loss": 0.8116, + "step": 26000 + }, + { + "epoch": 0.62, + "learning_rate": 4.813419718134587e-05, + "loss": 0.8367, + "step": 26500 + }, + { + "epoch": 0.63, + "learning_rate": 4.809530846529571e-05, + "loss": 0.8219, + "step": 27000 + }, + { + "epoch": 0.64, + "learning_rate": 4.805641974924556e-05, + "loss": 0.8206, + "step": 27500 + }, + { + "epoch": 0.65, + "learning_rate": 4.8017531033195415e-05, + "loss": 0.8199, + "step": 28000 + }, + { + "epoch": 0.66, + "learning_rate": 4.797864231714526e-05, + "loss": 0.816, + "step": 28500 + }, + { + "epoch": 0.67, + "learning_rate": 4.793983137852721e-05, + "loss": 0.8144, + "step": 29000 + }, + { + "epoch": 0.69, + "learning_rate": 4.790102043990916e-05, + "loss": 0.806, + "step": 29500 + }, + { + "epoch": 0.7, + "learning_rate": 4.786220950129111e-05, + "loss": 0.8134, + "step": 30000 + }, + { + "epoch": 0.71, + "learning_rate": 4.782332078524095e-05, + "loss": 0.8109, + "step": 30500 + }, + { + "epoch": 0.72, + "learning_rate": 4.7784432069190805e-05, + "loss": 0.792, + "step": 31000 + }, + { + "epoch": 0.73, + "learning_rate": 4.7745543353140656e-05, + "loss": 0.8046, + "step": 31500 + }, + { + "epoch": 0.74, + "learning_rate": 4.77066546370905e-05, + "loss": 0.7983, + "step": 32000 + }, + { + "epoch": 0.76, + "learning_rate": 4.766776592104035e-05, + "loss": 0.7937, + "step": 32500 + }, + { + "epoch": 0.77, + "learning_rate": 4.7628877204990204e-05, + "loss": 0.7978, + "step": 33000 + }, + { + "epoch": 0.78, + "learning_rate": 4.758998848894005e-05, + "loss": 0.7961, + "step": 33500 + }, + { + "epoch": 0.79, + "learning_rate": 4.75510997728899e-05, + "loss": 0.7854, + "step": 34000 + }, + { + "epoch": 0.8, + "learning_rate": 4.751221105683975e-05, + "loss": 0.8006, + "step": 34500 + }, + { + "epoch": 0.81, + "learning_rate": 4.7473322340789597e-05, + "loss": 0.7857, + "step": 35000 + }, + { + "epoch": 0.83, + "learning_rate": 4.743443362473945e-05, + "loss": 0.7857, + "step": 35500 + }, + { + "epoch": 0.84, + "learning_rate": 4.73956226861214e-05, + "loss": 0.7874, + "step": 36000 + }, + { + "epoch": 0.85, + "learning_rate": 4.735673397007125e-05, + "loss": 0.7826, + "step": 36500 + }, + { + "epoch": 0.86, + "learning_rate": 4.73178452540211e-05, + "loss": 0.7902, + "step": 37000 + }, + { + "epoch": 0.87, + "learning_rate": 4.727895653797094e-05, + "loss": 0.7695, + "step": 37500 + }, + { + "epoch": 0.88, + "learning_rate": 4.724006782192079e-05, + "loss": 0.7789, + "step": 38000 + }, + { + "epoch": 0.89, + "learning_rate": 4.720117910587064e-05, + "loss": 0.7739, + "step": 38500 + }, + { + "epoch": 0.91, + "learning_rate": 4.716229038982049e-05, + "loss": 0.7726, + "step": 39000 + }, + { + "epoch": 0.92, + "learning_rate": 4.712340167377034e-05, + "loss": 0.7722, + "step": 39500 + }, + { + "epoch": 0.93, + "learning_rate": 4.708451295772019e-05, + "loss": 0.7656, + "step": 40000 + }, + { + "epoch": 0.94, + "learning_rate": 4.704562424167004e-05, + "loss": 0.77, + "step": 40500 + }, + { + "epoch": 0.95, + "learning_rate": 4.700689108048409e-05, + "loss": 0.7814, + "step": 41000 + }, + { + "epoch": 0.96, + "learning_rate": 4.696800236443394e-05, + "loss": 0.7704, + "step": 41500 + }, + { + "epoch": 0.98, + "learning_rate": 4.692919142581589e-05, + "loss": 0.7611, + "step": 42000 + }, + { + "epoch": 0.99, + "learning_rate": 4.689038048719784e-05, + "loss": 0.7629, + "step": 42500 + }, + { + "epoch": 1.0, + "learning_rate": 4.6851491771147685e-05, + "loss": 0.7676, + "step": 43000 + }, + { + "epoch": 1.0, + "eval_bleu": 55.2526, + "eval_gen_len": 16.382, + "eval_loss": 0.7125306129455566, + "eval_runtime": 7352.67, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.461, + "step": 43024 + }, + { + "epoch": 1.01, + "learning_rate": 4.6812603055097536e-05, + "loss": 0.6434, + "step": 43500 + }, + { + "epoch": 1.02, + "learning_rate": 4.6773792116479486e-05, + "loss": 0.6436, + "step": 44000 + }, + { + "epoch": 1.03, + "learning_rate": 4.673490340042934e-05, + "loss": 0.651, + "step": 44500 + }, + { + "epoch": 1.05, + "learning_rate": 4.669601468437918e-05, + "loss": 0.6517, + "step": 45000 + }, + { + "epoch": 1.06, + "learning_rate": 4.665712596832903e-05, + "loss": 0.6416, + "step": 45500 + }, + { + "epoch": 1.07, + "learning_rate": 4.661823725227888e-05, + "loss": 0.6517, + "step": 46000 + }, + { + "epoch": 1.08, + "learning_rate": 4.657934853622873e-05, + "loss": 0.6539, + "step": 46500 + }, + { + "epoch": 1.09, + "learning_rate": 4.654045982017858e-05, + "loss": 0.6587, + "step": 47000 + }, + { + "epoch": 1.1, + "learning_rate": 4.6501571104128426e-05, + "loss": 0.6518, + "step": 47500 + }, + { + "epoch": 1.12, + "learning_rate": 4.646268238807828e-05, + "loss": 0.6465, + "step": 48000 + }, + { + "epoch": 1.13, + "learning_rate": 4.642379367202813e-05, + "loss": 0.6437, + "step": 48500 + }, + { + "epoch": 1.14, + "learning_rate": 4.6384904955977974e-05, + "loss": 0.6458, + "step": 49000 + }, + { + "epoch": 1.15, + "learning_rate": 4.6346016239927825e-05, + "loss": 0.6476, + "step": 49500 + }, + { + "epoch": 1.16, + "learning_rate": 4.630712752387768e-05, + "loss": 0.6587, + "step": 50000 + }, + { + "epoch": 1.17, + "learning_rate": 4.626823880782752e-05, + "loss": 0.6392, + "step": 50500 + }, + { + "epoch": 1.19, + "learning_rate": 4.622935009177737e-05, + "loss": 0.6547, + "step": 51000 + }, + { + "epoch": 1.2, + "learning_rate": 4.6190461375727225e-05, + "loss": 0.6511, + "step": 51500 + }, + { + "epoch": 1.21, + "learning_rate": 4.615157265967707e-05, + "loss": 0.6522, + "step": 52000 + }, + { + "epoch": 1.22, + "learning_rate": 4.611268394362692e-05, + "loss": 0.6555, + "step": 52500 + }, + { + "epoch": 1.23, + "learning_rate": 4.607395078244097e-05, + "loss": 0.6461, + "step": 53000 + }, + { + "epoch": 1.24, + "learning_rate": 4.6035062066390814e-05, + "loss": 0.6471, + "step": 53500 + }, + { + "epoch": 1.26, + "learning_rate": 4.5996173350340665e-05, + "loss": 0.6563, + "step": 54000 + }, + { + "epoch": 1.27, + "learning_rate": 4.5957284634290517e-05, + "loss": 0.6447, + "step": 54500 + }, + { + "epoch": 1.28, + "learning_rate": 4.591839591824036e-05, + "loss": 0.6483, + "step": 55000 + }, + { + "epoch": 1.29, + "learning_rate": 4.587958497962231e-05, + "loss": 0.6475, + "step": 55500 + }, + { + "epoch": 1.3, + "learning_rate": 4.584069626357216e-05, + "loss": 0.6546, + "step": 56000 + }, + { + "epoch": 1.31, + "learning_rate": 4.5801807547522014e-05, + "loss": 0.6528, + "step": 56500 + }, + { + "epoch": 1.32, + "learning_rate": 4.5762918831471866e-05, + "loss": 0.6538, + "step": 57000 + }, + { + "epoch": 1.34, + "learning_rate": 4.572403011542171e-05, + "loss": 0.6511, + "step": 57500 + }, + { + "epoch": 1.35, + "learning_rate": 4.568521917680366e-05, + "loss": 0.6457, + "step": 58000 + }, + { + "epoch": 1.36, + "learning_rate": 4.564633046075351e-05, + "loss": 0.659, + "step": 58500 + }, + { + "epoch": 1.37, + "learning_rate": 4.560744174470336e-05, + "loss": 0.6358, + "step": 59000 + }, + { + "epoch": 1.38, + "learning_rate": 4.556855302865321e-05, + "loss": 0.6425, + "step": 59500 + }, + { + "epoch": 1.39, + "learning_rate": 4.552966431260306e-05, + "loss": 0.644, + "step": 60000 + }, + { + "epoch": 1.41, + "learning_rate": 4.5490775596552904e-05, + "loss": 0.6537, + "step": 60500 + }, + { + "epoch": 1.42, + "learning_rate": 4.5451886880502756e-05, + "loss": 0.6529, + "step": 61000 + }, + { + "epoch": 1.43, + "learning_rate": 4.5413075941884705e-05, + "loss": 0.6547, + "step": 61500 + }, + { + "epoch": 1.44, + "learning_rate": 4.537418722583455e-05, + "loss": 0.6443, + "step": 62000 + }, + { + "epoch": 1.45, + "learning_rate": 4.53352985097844e-05, + "loss": 0.6516, + "step": 62500 + }, + { + "epoch": 1.46, + "learning_rate": 4.529640979373425e-05, + "loss": 0.6439, + "step": 63000 + }, + { + "epoch": 1.48, + "learning_rate": 4.52575210776841e-05, + "loss": 0.6401, + "step": 63500 + }, + { + "epoch": 1.49, + "learning_rate": 4.521863236163395e-05, + "loss": 0.6545, + "step": 64000 + }, + { + "epoch": 1.5, + "learning_rate": 4.51797436455838e-05, + "loss": 0.6433, + "step": 64500 + }, + { + "epoch": 1.51, + "learning_rate": 4.514085492953365e-05, + "loss": 0.6538, + "step": 65000 + }, + { + "epoch": 1.52, + "learning_rate": 4.51019662134835e-05, + "loss": 0.6548, + "step": 65500 + }, + { + "epoch": 1.53, + "learning_rate": 4.506307749743335e-05, + "loss": 0.6458, + "step": 66000 + }, + { + "epoch": 1.55, + "learning_rate": 4.50241887813832e-05, + "loss": 0.649, + "step": 66500 + }, + { + "epoch": 1.56, + "learning_rate": 4.4985300065333045e-05, + "loss": 0.6471, + "step": 67000 + }, + { + "epoch": 1.57, + "learning_rate": 4.4946411349282896e-05, + "loss": 0.6526, + "step": 67500 + }, + { + "epoch": 1.58, + "learning_rate": 4.4907600410664846e-05, + "loss": 0.646, + "step": 68000 + }, + { + "epoch": 1.59, + "learning_rate": 4.486878947204679e-05, + "loss": 0.6491, + "step": 68500 + }, + { + "epoch": 1.6, + "learning_rate": 4.482990075599664e-05, + "loss": 0.6484, + "step": 69000 + }, + { + "epoch": 1.62, + "learning_rate": 4.479101203994649e-05, + "loss": 0.6421, + "step": 69500 + }, + { + "epoch": 1.63, + "learning_rate": 4.475220110132844e-05, + "loss": 0.6498, + "step": 70000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4713312385278286e-05, + "loss": 0.6497, + "step": 70500 + }, + { + "epoch": 1.65, + "learning_rate": 4.467442366922814e-05, + "loss": 0.6503, + "step": 71000 + }, + { + "epoch": 1.66, + "learning_rate": 4.463553495317799e-05, + "loss": 0.6314, + "step": 71500 + }, + { + "epoch": 1.67, + "learning_rate": 4.459672401455994e-05, + "loss": 0.6495, + "step": 72000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4557835298509784e-05, + "loss": 0.643, + "step": 72500 + }, + { + "epoch": 1.7, + "learning_rate": 4.4518946582459635e-05, + "loss": 0.6542, + "step": 73000 + }, + { + "epoch": 1.71, + "learning_rate": 4.448005786640949e-05, + "loss": 0.6522, + "step": 73500 + }, + { + "epoch": 1.72, + "learning_rate": 4.444116915035934e-05, + "loss": 0.6476, + "step": 74000 + }, + { + "epoch": 1.73, + "learning_rate": 4.440228043430918e-05, + "loss": 0.6337, + "step": 74500 + }, + { + "epoch": 1.74, + "learning_rate": 4.4363391718259035e-05, + "loss": 0.6546, + "step": 75000 + }, + { + "epoch": 1.75, + "learning_rate": 4.4324503002208886e-05, + "loss": 0.647, + "step": 75500 + }, + { + "epoch": 1.77, + "learning_rate": 4.4285614286158724e-05, + "loss": 0.6467, + "step": 76000 + }, + { + "epoch": 1.78, + "learning_rate": 4.4246725570108576e-05, + "loss": 0.6432, + "step": 76500 + }, + { + "epoch": 1.79, + "learning_rate": 4.420783685405843e-05, + "loss": 0.6399, + "step": 77000 + }, + { + "epoch": 1.8, + "learning_rate": 4.416894813800828e-05, + "loss": 0.6382, + "step": 77500 + }, + { + "epoch": 1.81, + "learning_rate": 4.4130059421958123e-05, + "loss": 0.649, + "step": 78000 + }, + { + "epoch": 1.82, + "learning_rate": 4.409124848334007e-05, + "loss": 0.6385, + "step": 78500 + }, + { + "epoch": 1.84, + "learning_rate": 4.4052359767289925e-05, + "loss": 0.6448, + "step": 79000 + }, + { + "epoch": 1.85, + "learning_rate": 4.4013471051239776e-05, + "loss": 0.638, + "step": 79500 + }, + { + "epoch": 1.86, + "learning_rate": 4.397458233518962e-05, + "loss": 0.6317, + "step": 80000 + }, + { + "epoch": 1.87, + "learning_rate": 4.393569361913947e-05, + "loss": 0.6338, + "step": 80500 + }, + { + "epoch": 1.88, + "learning_rate": 4.3896804903089324e-05, + "loss": 0.6406, + "step": 81000 + }, + { + "epoch": 1.89, + "learning_rate": 4.385791618703917e-05, + "loss": 0.6363, + "step": 81500 + }, + { + "epoch": 1.91, + "learning_rate": 4.381902747098902e-05, + "loss": 0.6381, + "step": 82000 + }, + { + "epoch": 1.92, + "learning_rate": 4.378021653237097e-05, + "loss": 0.6383, + "step": 82500 + }, + { + "epoch": 1.93, + "learning_rate": 4.374140559375292e-05, + "loss": 0.6351, + "step": 83000 + }, + { + "epoch": 1.94, + "learning_rate": 4.370251687770277e-05, + "loss": 0.642, + "step": 83500 + }, + { + "epoch": 1.95, + "learning_rate": 4.3663705939084714e-05, + "loss": 0.6399, + "step": 84000 + }, + { + "epoch": 1.96, + "learning_rate": 4.3624817223034566e-05, + "loss": 0.6351, + "step": 84500 + }, + { + "epoch": 1.98, + "learning_rate": 4.358592850698442e-05, + "loss": 0.631, + "step": 85000 + }, + { + "epoch": 1.99, + "learning_rate": 4.354703979093426e-05, + "loss": 0.6298, + "step": 85500 + }, + { + "epoch": 2.0, + "learning_rate": 4.350822885231621e-05, + "loss": 0.6349, + "step": 86000 + }, + { + "epoch": 2.0, + "eval_bleu": 58.202, + "eval_gen_len": 15.9466, + "eval_loss": 0.6546894311904907, + "eval_runtime": 6844.8665, + "eval_samples_per_second": 12.552, + "eval_steps_per_second": 1.569, + "step": 86048 + }, + { + "epoch": 2.01, + "learning_rate": 4.346934013626606e-05, + "loss": 0.5093, + "step": 86500 + }, + { + "epoch": 2.02, + "learning_rate": 4.3430451420215914e-05, + "loss": 0.5066, + "step": 87000 + }, + { + "epoch": 2.03, + "learning_rate": 4.339156270416576e-05, + "loss": 0.4956, + "step": 87500 + }, + { + "epoch": 2.05, + "learning_rate": 4.335267398811561e-05, + "loss": 0.5, + "step": 88000 + }, + { + "epoch": 2.06, + "learning_rate": 4.331378527206546e-05, + "loss": 0.5034, + "step": 88500 + }, + { + "epoch": 2.07, + "learning_rate": 4.327489655601531e-05, + "loss": 0.505, + "step": 89000 + }, + { + "epoch": 2.08, + "learning_rate": 4.323600783996516e-05, + "loss": 0.4995, + "step": 89500 + }, + { + "epoch": 2.09, + "learning_rate": 4.319719690134711e-05, + "loss": 0.5064, + "step": 90000 + }, + { + "epoch": 2.1, + "learning_rate": 4.315830818529696e-05, + "loss": 0.5065, + "step": 90500 + }, + { + "epoch": 2.12, + "learning_rate": 4.311949724667891e-05, + "loss": 0.5031, + "step": 91000 + }, + { + "epoch": 2.13, + "learning_rate": 4.3080608530628754e-05, + "loss": 0.4967, + "step": 91500 + }, + { + "epoch": 2.14, + "learning_rate": 4.30417198145786e-05, + "loss": 0.5112, + "step": 92000 + }, + { + "epoch": 2.15, + "learning_rate": 4.300290887596055e-05, + "loss": 0.5069, + "step": 92500 + }, + { + "epoch": 2.16, + "learning_rate": 4.29640201599104e-05, + "loss": 0.5106, + "step": 93000 + }, + { + "epoch": 2.17, + "learning_rate": 4.292513144386025e-05, + "loss": 0.5077, + "step": 93500 + }, + { + "epoch": 2.18, + "learning_rate": 4.28862427278101e-05, + "loss": 0.5179, + "step": 94000 + }, + { + "epoch": 2.2, + "learning_rate": 4.284735401175995e-05, + "loss": 0.5192, + "step": 94500 + }, + { + "epoch": 2.21, + "learning_rate": 4.28084652957098e-05, + "loss": 0.5103, + "step": 95000 + }, + { + "epoch": 2.22, + "learning_rate": 4.276957657965965e-05, + "loss": 0.5203, + "step": 95500 + }, + { + "epoch": 2.23, + "learning_rate": 4.2730687863609496e-05, + "loss": 0.5179, + "step": 96000 + }, + { + "epoch": 2.24, + "learning_rate": 4.269179914755935e-05, + "loss": 0.515, + "step": 96500 + }, + { + "epoch": 2.25, + "learning_rate": 4.26529104315092e-05, + "loss": 0.5133, + "step": 97000 + }, + { + "epoch": 2.27, + "learning_rate": 4.2614021715459043e-05, + "loss": 0.5147, + "step": 97500 + }, + { + "epoch": 2.28, + "learning_rate": 4.2575132999408895e-05, + "loss": 0.5308, + "step": 98000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2536322060790845e-05, + "loss": 0.5207, + "step": 98500 + }, + { + "epoch": 2.3, + "learning_rate": 4.2497511122172794e-05, + "loss": 0.5228, + "step": 99000 + }, + { + "epoch": 2.31, + "learning_rate": 4.245862240612264e-05, + "loss": 0.5207, + "step": 99500 + }, + { + "epoch": 2.32, + "learning_rate": 4.241973369007249e-05, + "loss": 0.5211, + "step": 100000 + }, + { + "epoch": 2.34, + "learning_rate": 4.2380844974022335e-05, + "loss": 0.5169, + "step": 100500 + }, + { + "epoch": 2.35, + "learning_rate": 4.234195625797219e-05, + "loss": 0.5302, + "step": 101000 + }, + { + "epoch": 2.36, + "learning_rate": 4.230306754192204e-05, + "loss": 0.5155, + "step": 101500 + }, + { + "epoch": 2.37, + "learning_rate": 4.226417882587188e-05, + "loss": 0.5193, + "step": 102000 + }, + { + "epoch": 2.38, + "learning_rate": 4.2225290109821735e-05, + "loss": 0.518, + "step": 102500 + }, + { + "epoch": 2.39, + "learning_rate": 4.2186401393771586e-05, + "loss": 0.5171, + "step": 103000 + }, + { + "epoch": 2.41, + "learning_rate": 4.214751267772144e-05, + "loss": 0.5208, + "step": 103500 + }, + { + "epoch": 2.42, + "learning_rate": 4.210862396167128e-05, + "loss": 0.5231, + "step": 104000 + }, + { + "epoch": 2.43, + "learning_rate": 4.2069735245621134e-05, + "loss": 0.5267, + "step": 104500 + }, + { + "epoch": 2.44, + "learning_rate": 4.2030846529570985e-05, + "loss": 0.5196, + "step": 105000 + }, + { + "epoch": 2.45, + "learning_rate": 4.199195781352083e-05, + "loss": 0.5234, + "step": 105500 + }, + { + "epoch": 2.46, + "learning_rate": 4.1953224652334885e-05, + "loss": 0.526, + "step": 106000 + }, + { + "epoch": 2.48, + "learning_rate": 4.191449149114893e-05, + "loss": 0.5312, + "step": 106500 + }, + { + "epoch": 2.49, + "learning_rate": 4.187560277509878e-05, + "loss": 0.5286, + "step": 107000 + }, + { + "epoch": 2.5, + "learning_rate": 4.183671405904863e-05, + "loss": 0.5225, + "step": 107500 + }, + { + "epoch": 2.51, + "learning_rate": 4.1797825342998474e-05, + "loss": 0.5207, + "step": 108000 + }, + { + "epoch": 2.52, + "learning_rate": 4.1758936626948325e-05, + "loss": 0.5263, + "step": 108500 + }, + { + "epoch": 2.53, + "learning_rate": 4.172004791089818e-05, + "loss": 0.5243, + "step": 109000 + }, + { + "epoch": 2.55, + "learning_rate": 4.1681236972280126e-05, + "loss": 0.5332, + "step": 109500 + }, + { + "epoch": 2.56, + "learning_rate": 4.164234825622997e-05, + "loss": 0.5297, + "step": 110000 + }, + { + "epoch": 2.57, + "learning_rate": 4.160345954017982e-05, + "loss": 0.5274, + "step": 110500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1564570824129674e-05, + "loss": 0.5241, + "step": 111000 + }, + { + "epoch": 2.59, + "learning_rate": 4.152568210807952e-05, + "loss": 0.528, + "step": 111500 + }, + { + "epoch": 2.6, + "learning_rate": 4.148679339202937e-05, + "loss": 0.53, + "step": 112000 + }, + { + "epoch": 2.61, + "learning_rate": 4.144790467597922e-05, + "loss": 0.5291, + "step": 112500 + }, + { + "epoch": 2.63, + "learning_rate": 4.1409015959929073e-05, + "loss": 0.5431, + "step": 113000 + }, + { + "epoch": 2.64, + "learning_rate": 4.137012724387892e-05, + "loss": 0.5336, + "step": 113500 + }, + { + "epoch": 2.65, + "learning_rate": 4.133123852782877e-05, + "loss": 0.531, + "step": 114000 + }, + { + "epoch": 2.66, + "learning_rate": 4.129234981177862e-05, + "loss": 0.5278, + "step": 114500 + }, + { + "epoch": 2.67, + "learning_rate": 4.1253461095728466e-05, + "loss": 0.5214, + "step": 115000 + }, + { + "epoch": 2.68, + "learning_rate": 4.121457237967831e-05, + "loss": 0.5249, + "step": 115500 + }, + { + "epoch": 2.7, + "learning_rate": 4.117568366362816e-05, + "loss": 0.5351, + "step": 116000 + }, + { + "epoch": 2.71, + "learning_rate": 4.1136794947578014e-05, + "loss": 0.5275, + "step": 116500 + }, + { + "epoch": 2.72, + "learning_rate": 4.109790623152786e-05, + "loss": 0.5309, + "step": 117000 + }, + { + "epoch": 2.73, + "learning_rate": 4.105901751547771e-05, + "loss": 0.5291, + "step": 117500 + }, + { + "epoch": 2.74, + "learning_rate": 4.102012879942756e-05, + "loss": 0.5274, + "step": 118000 + }, + { + "epoch": 2.75, + "learning_rate": 4.098131786080951e-05, + "loss": 0.532, + "step": 118500 + }, + { + "epoch": 2.77, + "learning_rate": 4.0942429144759356e-05, + "loss": 0.5278, + "step": 119000 + }, + { + "epoch": 2.78, + "learning_rate": 4.090354042870921e-05, + "loss": 0.5338, + "step": 119500 + }, + { + "epoch": 2.79, + "learning_rate": 4.086465171265906e-05, + "loss": 0.5289, + "step": 120000 + }, + { + "epoch": 2.8, + "learning_rate": 4.082576299660891e-05, + "loss": 0.5388, + "step": 120500 + }, + { + "epoch": 2.81, + "learning_rate": 4.0786874280558755e-05, + "loss": 0.5337, + "step": 121000 + }, + { + "epoch": 2.82, + "learning_rate": 4.074798556450861e-05, + "loss": 0.5308, + "step": 121500 + }, + { + "epoch": 2.84, + "learning_rate": 4.0709174625890556e-05, + "loss": 0.5294, + "step": 122000 + }, + { + "epoch": 2.85, + "learning_rate": 4.067028590984041e-05, + "loss": 0.5385, + "step": 122500 + }, + { + "epoch": 2.86, + "learning_rate": 4.063139719379025e-05, + "loss": 0.5379, + "step": 123000 + }, + { + "epoch": 2.87, + "learning_rate": 4.05925084777401e-05, + "loss": 0.5485, + "step": 123500 + }, + { + "epoch": 2.88, + "learning_rate": 4.055361976168995e-05, + "loss": 0.5391, + "step": 124000 + }, + { + "epoch": 2.89, + "learning_rate": 4.05147310456398e-05, + "loss": 0.5414, + "step": 124500 + }, + { + "epoch": 2.91, + "learning_rate": 4.0475842329589645e-05, + "loss": 0.5237, + "step": 125000 + }, + { + "epoch": 2.92, + "learning_rate": 4.04369536135395e-05, + "loss": 0.5363, + "step": 125500 + }, + { + "epoch": 2.93, + "learning_rate": 4.039806489748935e-05, + "loss": 0.543, + "step": 126000 + }, + { + "epoch": 2.94, + "learning_rate": 4.0359331736303396e-05, + "loss": 0.5265, + "step": 126500 + }, + { + "epoch": 2.95, + "learning_rate": 4.032044302025325e-05, + "loss": 0.5379, + "step": 127000 + }, + { + "epoch": 2.96, + "learning_rate": 4.028155430420309e-05, + "loss": 0.5311, + "step": 127500 + }, + { + "epoch": 2.98, + "learning_rate": 4.0242665588152944e-05, + "loss": 0.5361, + "step": 128000 + }, + { + "epoch": 2.99, + "learning_rate": 4.0203776872102795e-05, + "loss": 0.5405, + "step": 128500 + }, + { + "epoch": 3.0, + "learning_rate": 4.016488815605265e-05, + "loss": 0.537, + "step": 129000 + }, + { + "epoch": 3.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6645.2011, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616, + "step": 129072 + }, + { + "epoch": 3.01, + "learning_rate": 4.0126077217434597e-05, + "loss": 0.4217, + "step": 129500 + }, + { + "epoch": 3.02, + "learning_rate": 4.008718850138444e-05, + "loss": 0.3942, + "step": 130000 + }, + { + "epoch": 3.03, + "learning_rate": 4.0048299785334286e-05, + "loss": 0.3948, + "step": 130500 + }, + { + "epoch": 3.04, + "learning_rate": 4.0009488846716236e-05, + "loss": 0.3895, + "step": 131000 + }, + { + "epoch": 3.06, + "learning_rate": 3.9970677908098185e-05, + "loss": 0.3912, + "step": 131500 + }, + { + "epoch": 3.07, + "learning_rate": 3.993178919204804e-05, + "loss": 0.3989, + "step": 132000 + }, + { + "epoch": 3.08, + "learning_rate": 3.989290047599789e-05, + "loss": 0.3975, + "step": 132500 + }, + { + "epoch": 3.09, + "learning_rate": 3.985401175994773e-05, + "loss": 0.4036, + "step": 133000 + }, + { + "epoch": 3.1, + "learning_rate": 3.9815123043897585e-05, + "loss": 0.4028, + "step": 133500 + }, + { + "epoch": 3.11, + "learning_rate": 3.9776234327847436e-05, + "loss": 0.3985, + "step": 134000 + }, + { + "epoch": 3.13, + "learning_rate": 3.973734561179728e-05, + "loss": 0.4002, + "step": 134500 + }, + { + "epoch": 3.14, + "learning_rate": 3.969845689574713e-05, + "loss": 0.3975, + "step": 135000 + }, + { + "epoch": 3.15, + "learning_rate": 3.9659568179696984e-05, + "loss": 0.4024, + "step": 135500 + }, + { + "epoch": 3.16, + "learning_rate": 3.962067946364683e-05, + "loss": 0.4016, + "step": 136000 + }, + { + "epoch": 3.17, + "learning_rate": 3.958179074759668e-05, + "loss": 0.4084, + "step": 136500 + }, + { + "epoch": 3.18, + "learning_rate": 3.954290203154653e-05, + "loss": 0.4054, + "step": 137000 + }, + { + "epoch": 3.2, + "learning_rate": 3.9504013315496377e-05, + "loss": 0.4061, + "step": 137500 + }, + { + "epoch": 3.21, + "learning_rate": 3.946512459944623e-05, + "loss": 0.4098, + "step": 138000 + }, + { + "epoch": 3.22, + "learning_rate": 3.942623588339607e-05, + "loss": 0.4115, + "step": 138500 + }, + { + "epoch": 3.23, + "learning_rate": 3.9387347167345924e-05, + "loss": 0.4068, + "step": 139000 + }, + { + "epoch": 3.24, + "learning_rate": 3.9348536228727874e-05, + "loss": 0.4058, + "step": 139500 + }, + { + "epoch": 3.25, + "learning_rate": 3.930964751267772e-05, + "loss": 0.4059, + "step": 140000 + }, + { + "epoch": 3.27, + "learning_rate": 3.927075879662757e-05, + "loss": 0.4145, + "step": 140500 + }, + { + "epoch": 3.28, + "learning_rate": 3.9232025635441625e-05, + "loss": 0.4104, + "step": 141000 + }, + { + "epoch": 3.29, + "learning_rate": 3.919313691939147e-05, + "loss": 0.4141, + "step": 141500 + }, + { + "epoch": 3.3, + "learning_rate": 3.915424820334132e-05, + "loss": 0.4158, + "step": 142000 + }, + { + "epoch": 3.31, + "learning_rate": 3.911535948729117e-05, + "loss": 0.4115, + "step": 142500 + }, + { + "epoch": 3.32, + "learning_rate": 3.907647077124102e-05, + "loss": 0.4197, + "step": 143000 + }, + { + "epoch": 3.34, + "learning_rate": 3.903758205519087e-05, + "loss": 0.4082, + "step": 143500 + }, + { + "epoch": 3.35, + "learning_rate": 3.899869333914072e-05, + "loss": 0.4231, + "step": 144000 + }, + { + "epoch": 3.36, + "learning_rate": 3.8959804623090565e-05, + "loss": 0.4237, + "step": 144500 + }, + { + "epoch": 3.37, + "learning_rate": 3.892091590704042e-05, + "loss": 0.4162, + "step": 145000 + }, + { + "epoch": 3.38, + "learning_rate": 3.888202719099027e-05, + "loss": 0.4189, + "step": 145500 + }, + { + "epoch": 3.39, + "learning_rate": 3.884321625237221e-05, + "loss": 0.4154, + "step": 146000 + }, + { + "epoch": 3.41, + "learning_rate": 3.880432753632206e-05, + "loss": 0.4193, + "step": 146500 + }, + { + "epoch": 3.42, + "learning_rate": 3.876543882027191e-05, + "loss": 0.4238, + "step": 147000 + }, + { + "epoch": 3.43, + "learning_rate": 3.872655010422176e-05, + "loss": 0.4274, + "step": 147500 + }, + { + "epoch": 3.44, + "learning_rate": 3.868766138817161e-05, + "loss": 0.4165, + "step": 148000 + }, + { + "epoch": 3.45, + "learning_rate": 3.8648772672121455e-05, + "loss": 0.4223, + "step": 148500 + }, + { + "epoch": 3.46, + "learning_rate": 3.860988395607131e-05, + "loss": 0.4213, + "step": 149000 + }, + { + "epoch": 3.47, + "learning_rate": 3.8571073017453256e-05, + "loss": 0.422, + "step": 149500 + }, + { + "epoch": 3.49, + "learning_rate": 3.853218430140311e-05, + "loss": 0.4245, + "step": 150000 + }, + { + "epoch": 3.5, + "learning_rate": 3.849329558535296e-05, + "loss": 0.4247, + "step": 150500 + }, + { + "epoch": 3.51, + "learning_rate": 3.8454406869302804e-05, + "loss": 0.4267, + "step": 151000 + }, + { + "epoch": 3.52, + "learning_rate": 3.8415518153252656e-05, + "loss": 0.4253, + "step": 151500 + }, + { + "epoch": 3.53, + "learning_rate": 3.837662943720251e-05, + "loss": 0.4311, + "step": 152000 + }, + { + "epoch": 3.54, + "learning_rate": 3.833774072115235e-05, + "loss": 0.4204, + "step": 152500 + }, + { + "epoch": 3.56, + "learning_rate": 3.8298852005102203e-05, + "loss": 0.4246, + "step": 153000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8259963289052055e-05, + "loss": 0.4424, + "step": 153500 + }, + { + "epoch": 3.58, + "learning_rate": 3.82210745730019e-05, + "loss": 0.426, + "step": 154000 + }, + { + "epoch": 3.59, + "learning_rate": 3.8182185856951744e-05, + "loss": 0.4304, + "step": 154500 + }, + { + "epoch": 3.6, + "learning_rate": 3.8143374918333694e-05, + "loss": 0.4292, + "step": 155000 + }, + { + "epoch": 3.61, + "learning_rate": 3.8104486202283546e-05, + "loss": 0.426, + "step": 155500 + }, + { + "epoch": 3.63, + "learning_rate": 3.80655974862334e-05, + "loss": 0.4247, + "step": 156000 + }, + { + "epoch": 3.64, + "learning_rate": 3.802670877018324e-05, + "loss": 0.4271, + "step": 156500 + }, + { + "epoch": 3.65, + "learning_rate": 3.798789783156519e-05, + "loss": 0.4279, + "step": 157000 + }, + { + "epoch": 3.66, + "learning_rate": 3.794900911551504e-05, + "loss": 0.4357, + "step": 157500 + }, + { + "epoch": 3.67, + "learning_rate": 3.791019817689699e-05, + "loss": 0.435, + "step": 158000 + }, + { + "epoch": 3.68, + "learning_rate": 3.7871309460846844e-05, + "loss": 0.4336, + "step": 158500 + }, + { + "epoch": 3.7, + "learning_rate": 3.7832420744796696e-05, + "loss": 0.4303, + "step": 159000 + }, + { + "epoch": 3.71, + "learning_rate": 3.779353202874654e-05, + "loss": 0.4231, + "step": 159500 + }, + { + "epoch": 3.72, + "learning_rate": 3.775464331269639e-05, + "loss": 0.4303, + "step": 160000 + }, + { + "epoch": 3.73, + "learning_rate": 3.7715754596646244e-05, + "loss": 0.4301, + "step": 160500 + }, + { + "epoch": 3.74, + "learning_rate": 3.767686588059609e-05, + "loss": 0.4317, + "step": 161000 + }, + { + "epoch": 3.75, + "learning_rate": 3.763797716454593e-05, + "loss": 0.4421, + "step": 161500 + }, + { + "epoch": 3.77, + "learning_rate": 3.7599088448495785e-05, + "loss": 0.4338, + "step": 162000 + }, + { + "epoch": 3.78, + "learning_rate": 3.7560199732445636e-05, + "loss": 0.4315, + "step": 162500 + }, + { + "epoch": 3.79, + "learning_rate": 3.752131101639548e-05, + "loss": 0.4318, + "step": 163000 + }, + { + "epoch": 3.8, + "learning_rate": 3.748242230034533e-05, + "loss": 0.4378, + "step": 163500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7443533584295184e-05, + "loss": 0.438, + "step": 164000 + }, + { + "epoch": 3.82, + "learning_rate": 3.740464486824503e-05, + "loss": 0.4385, + "step": 164500 + }, + { + "epoch": 3.84, + "learning_rate": 3.736575615219488e-05, + "loss": 0.4365, + "step": 165000 + }, + { + "epoch": 3.85, + "learning_rate": 3.732686743614473e-05, + "loss": 0.4286, + "step": 165500 + }, + { + "epoch": 3.86, + "learning_rate": 3.7287978720094576e-05, + "loss": 0.4369, + "step": 166000 + }, + { + "epoch": 3.87, + "learning_rate": 3.7249167781476526e-05, + "loss": 0.4361, + "step": 166500 + }, + { + "epoch": 3.88, + "learning_rate": 3.721027906542638e-05, + "loss": 0.4371, + "step": 167000 + }, + { + "epoch": 3.89, + "learning_rate": 3.717139034937623e-05, + "loss": 0.4351, + "step": 167500 + }, + { + "epoch": 3.9, + "learning_rate": 3.713257941075818e-05, + "loss": 0.4361, + "step": 168000 + }, + { + "epoch": 3.92, + "learning_rate": 3.709376847214013e-05, + "loss": 0.434, + "step": 168500 + }, + { + "epoch": 3.93, + "learning_rate": 3.705487975608998e-05, + "loss": 0.4408, + "step": 169000 + }, + { + "epoch": 3.94, + "learning_rate": 3.701606881747192e-05, + "loss": 0.4398, + "step": 169500 + }, + { + "epoch": 3.95, + "learning_rate": 3.6977180101421774e-05, + "loss": 0.4396, + "step": 170000 + }, + { + "epoch": 3.96, + "learning_rate": 3.693829138537162e-05, + "loss": 0.438, + "step": 170500 + }, + { + "epoch": 3.97, + "learning_rate": 3.689940266932147e-05, + "loss": 0.4374, + "step": 171000 + }, + { + "epoch": 3.99, + "learning_rate": 3.686051395327132e-05, + "loss": 0.4407, + "step": 171500 + }, + { + "epoch": 4.0, + "learning_rate": 3.682162523722117e-05, + "loss": 0.434, + "step": 172000 + }, + { + "epoch": 4.0, + "eval_bleu": 59.6194, + "eval_gen_len": 15.702, + "eval_loss": 0.6589247584342957, + "eval_runtime": 6589.9687, + "eval_samples_per_second": 13.037, + "eval_steps_per_second": 1.63, + "step": 172096 + }, + { + "epoch": 4.01, + "learning_rate": 3.678273652117102e-05, + "loss": 0.3325, + "step": 172500 + }, + { + "epoch": 4.02, + "learning_rate": 3.674384780512087e-05, + "loss": 0.3007, + "step": 173000 + }, + { + "epoch": 4.03, + "learning_rate": 3.6704959089070715e-05, + "loss": 0.305, + "step": 173500 + }, + { + "epoch": 4.04, + "learning_rate": 3.6666070373020566e-05, + "loss": 0.3016, + "step": 174000 + }, + { + "epoch": 4.06, + "learning_rate": 3.662718165697042e-05, + "loss": 0.2961, + "step": 174500 + }, + { + "epoch": 4.07, + "learning_rate": 3.6588526273216564e-05, + "loss": 0.3007, + "step": 175000 + }, + { + "epoch": 4.08, + "learning_rate": 3.6549637557166415e-05, + "loss": 0.3012, + "step": 175500 + }, + { + "epoch": 4.09, + "learning_rate": 3.651074884111627e-05, + "loss": 0.3094, + "step": 176000 + }, + { + "epoch": 4.1, + "learning_rate": 3.647186012506612e-05, + "loss": 0.3069, + "step": 176500 + }, + { + "epoch": 4.11, + "learning_rate": 3.6432971409015956e-05, + "loss": 0.3091, + "step": 177000 + }, + { + "epoch": 4.13, + "learning_rate": 3.639408269296581e-05, + "loss": 0.3147, + "step": 177500 + }, + { + "epoch": 4.14, + "learning_rate": 3.635519397691566e-05, + "loss": 0.3112, + "step": 178000 + }, + { + "epoch": 4.15, + "learning_rate": 3.6316305260865504e-05, + "loss": 0.3129, + "step": 178500 + }, + { + "epoch": 4.16, + "learning_rate": 3.6277416544815356e-05, + "loss": 0.309, + "step": 179000 + }, + { + "epoch": 4.17, + "learning_rate": 3.623852782876521e-05, + "loss": 0.3195, + "step": 179500 + }, + { + "epoch": 4.18, + "learning_rate": 3.619963911271506e-05, + "loss": 0.3219, + "step": 180000 + }, + { + "epoch": 4.2, + "learning_rate": 3.6160750396664903e-05, + "loss": 0.3134, + "step": 180500 + }, + { + "epoch": 4.21, + "learning_rate": 3.6121861680614755e-05, + "loss": 0.3182, + "step": 181000 + }, + { + "epoch": 4.22, + "learning_rate": 3.6083050741996705e-05, + "loss": 0.3131, + "step": 181500 + }, + { + "epoch": 4.23, + "learning_rate": 3.6044162025946556e-05, + "loss": 0.3193, + "step": 182000 + }, + { + "epoch": 4.24, + "learning_rate": 3.6005351087328506e-05, + "loss": 0.3153, + "step": 182500 + }, + { + "epoch": 4.25, + "learning_rate": 3.596646237127835e-05, + "loss": 0.3264, + "step": 183000 + }, + { + "epoch": 4.27, + "learning_rate": 3.59276514326603e-05, + "loss": 0.3195, + "step": 183500 + }, + { + "epoch": 4.28, + "learning_rate": 3.588876271661015e-05, + "loss": 0.3197, + "step": 184000 + }, + { + "epoch": 4.29, + "learning_rate": 3.5849874000559996e-05, + "loss": 0.3235, + "step": 184500 + }, + { + "epoch": 4.3, + "learning_rate": 3.581098528450985e-05, + "loss": 0.3188, + "step": 185000 + }, + { + "epoch": 4.31, + "learning_rate": 3.577209656845969e-05, + "loss": 0.3183, + "step": 185500 + }, + { + "epoch": 4.32, + "learning_rate": 3.5733207852409544e-05, + "loss": 0.3271, + "step": 186000 + }, + { + "epoch": 4.33, + "learning_rate": 3.5694319136359396e-05, + "loss": 0.3333, + "step": 186500 + }, + { + "epoch": 4.35, + "learning_rate": 3.565543042030924e-05, + "loss": 0.3216, + "step": 187000 + }, + { + "epoch": 4.36, + "learning_rate": 3.561654170425909e-05, + "loss": 0.3296, + "step": 187500 + }, + { + "epoch": 4.37, + "learning_rate": 3.5577652988208944e-05, + "loss": 0.3282, + "step": 188000 + }, + { + "epoch": 4.38, + "learning_rate": 3.5538764272158795e-05, + "loss": 0.3297, + "step": 188500 + }, + { + "epoch": 4.39, + "learning_rate": 3.549987555610864e-05, + "loss": 0.3283, + "step": 189000 + }, + { + "epoch": 4.4, + "learning_rate": 3.546106461749059e-05, + "loss": 0.3294, + "step": 189500 + }, + { + "epoch": 4.42, + "learning_rate": 3.542217590144044e-05, + "loss": 0.3275, + "step": 190000 + }, + { + "epoch": 4.43, + "learning_rate": 3.538328718539029e-05, + "loss": 0.324, + "step": 190500 + }, + { + "epoch": 4.44, + "learning_rate": 3.534439846934014e-05, + "loss": 0.3285, + "step": 191000 + }, + { + "epoch": 4.45, + "learning_rate": 3.530558753072209e-05, + "loss": 0.3338, + "step": 191500 + }, + { + "epoch": 4.46, + "learning_rate": 3.526669881467194e-05, + "loss": 0.3347, + "step": 192000 + }, + { + "epoch": 4.47, + "learning_rate": 3.522781009862178e-05, + "loss": 0.338, + "step": 192500 + }, + { + "epoch": 4.49, + "learning_rate": 3.5188921382571635e-05, + "loss": 0.3312, + "step": 193000 + }, + { + "epoch": 4.5, + "learning_rate": 3.515003266652148e-05, + "loss": 0.3364, + "step": 193500 + }, + { + "epoch": 4.51, + "learning_rate": 3.511114395047133e-05, + "loss": 0.3347, + "step": 194000 + }, + { + "epoch": 4.52, + "learning_rate": 3.507225523442118e-05, + "loss": 0.3334, + "step": 194500 + }, + { + "epoch": 4.53, + "learning_rate": 3.503336651837103e-05, + "loss": 0.3319, + "step": 195000 + }, + { + "epoch": 4.54, + "learning_rate": 3.499455557975298e-05, + "loss": 0.3332, + "step": 195500 + }, + { + "epoch": 4.56, + "learning_rate": 3.495566686370283e-05, + "loss": 0.3418, + "step": 196000 + }, + { + "epoch": 4.57, + "learning_rate": 3.491677814765268e-05, + "loss": 0.3402, + "step": 196500 + }, + { + "epoch": 4.58, + "learning_rate": 3.487796720903463e-05, + "loss": 0.3342, + "step": 197000 + }, + { + "epoch": 4.59, + "learning_rate": 3.483907849298448e-05, + "loss": 0.3294, + "step": 197500 + }, + { + "epoch": 4.6, + "learning_rate": 3.4800189776934326e-05, + "loss": 0.3393, + "step": 198000 + }, + { + "epoch": 4.61, + "learning_rate": 3.476130106088418e-05, + "loss": 0.3446, + "step": 198500 + }, + { + "epoch": 4.63, + "learning_rate": 3.472241234483403e-05, + "loss": 0.3375, + "step": 199000 + }, + { + "epoch": 4.64, + "learning_rate": 3.4683523628783874e-05, + "loss": 0.3399, + "step": 199500 + }, + { + "epoch": 4.65, + "learning_rate": 3.4644634912733725e-05, + "loss": 0.3405, + "step": 200000 + }, + { + "epoch": 4.66, + "learning_rate": 3.460574619668357e-05, + "loss": 0.3378, + "step": 200500 + }, + { + "epoch": 4.67, + "learning_rate": 3.456693525806552e-05, + "loss": 0.346, + "step": 201000 + }, + { + "epoch": 4.68, + "learning_rate": 3.452804654201537e-05, + "loss": 0.3455, + "step": 201500 + }, + { + "epoch": 4.7, + "learning_rate": 3.4489157825965216e-05, + "loss": 0.3432, + "step": 202000 + }, + { + "epoch": 4.71, + "learning_rate": 3.4450346887347166e-05, + "loss": 0.3505, + "step": 202500 + }, + { + "epoch": 4.72, + "learning_rate": 3.441145817129702e-05, + "loss": 0.3432, + "step": 203000 + }, + { + "epoch": 4.73, + "learning_rate": 3.437256945524687e-05, + "loss": 0.3405, + "step": 203500 + }, + { + "epoch": 4.74, + "learning_rate": 3.433368073919671e-05, + "loss": 0.3394, + "step": 204000 + }, + { + "epoch": 4.75, + "learning_rate": 3.4294792023146565e-05, + "loss": 0.3477, + "step": 204500 + }, + { + "epoch": 4.76, + "learning_rate": 3.4255903307096416e-05, + "loss": 0.352, + "step": 205000 + }, + { + "epoch": 4.78, + "learning_rate": 3.421701459104627e-05, + "loss": 0.3442, + "step": 205500 + }, + { + "epoch": 4.79, + "learning_rate": 3.417812587499611e-05, + "loss": 0.3487, + "step": 206000 + }, + { + "epoch": 4.8, + "learning_rate": 3.4139237158945964e-05, + "loss": 0.346, + "step": 206500 + }, + { + "epoch": 4.81, + "learning_rate": 3.4100348442895816e-05, + "loss": 0.3533, + "step": 207000 + }, + { + "epoch": 4.82, + "learning_rate": 3.406145972684566e-05, + "loss": 0.3501, + "step": 207500 + }, + { + "epoch": 4.83, + "learning_rate": 3.4022571010795505e-05, + "loss": 0.3472, + "step": 208000 + }, + { + "epoch": 4.85, + "learning_rate": 3.398368229474536e-05, + "loss": 0.351, + "step": 208500 + }, + { + "epoch": 4.86, + "learning_rate": 3.3944871356127306e-05, + "loss": 0.3415, + "step": 209000 + }, + { + "epoch": 4.87, + "learning_rate": 3.390598264007716e-05, + "loss": 0.3478, + "step": 209500 + }, + { + "epoch": 4.88, + "learning_rate": 3.386717170145911e-05, + "loss": 0.3475, + "step": 210000 + }, + { + "epoch": 4.89, + "learning_rate": 3.382828298540895e-05, + "loss": 0.3486, + "step": 210500 + }, + { + "epoch": 4.9, + "learning_rate": 3.3789394269358804e-05, + "loss": 0.3564, + "step": 211000 + }, + { + "epoch": 4.92, + "learning_rate": 3.3750505553308655e-05, + "loss": 0.3484, + "step": 211500 + }, + { + "epoch": 4.93, + "learning_rate": 3.37116168372585e-05, + "loss": 0.3541, + "step": 212000 + }, + { + "epoch": 4.94, + "learning_rate": 3.367280589864045e-05, + "loss": 0.3464, + "step": 212500 + }, + { + "epoch": 4.95, + "learning_rate": 3.36339949600224e-05, + "loss": 0.35, + "step": 213000 + }, + { + "epoch": 4.96, + "learning_rate": 3.359510624397225e-05, + "loss": 0.3475, + "step": 213500 + }, + { + "epoch": 4.97, + "learning_rate": 3.35562175279221e-05, + "loss": 0.3479, + "step": 214000 + }, + { + "epoch": 4.99, + "learning_rate": 3.3517328811871954e-05, + "loss": 0.3556, + "step": 214500 + }, + { + "epoch": 5.0, + "learning_rate": 3.34784400958218e-05, + "loss": 0.3504, + "step": 215000 + }, + { + "epoch": 5.0, + "eval_bleu": 59.352, + "eval_gen_len": 15.7454, + "eval_loss": 0.7117229700088501, + "eval_runtime": 6592.0433, + "eval_samples_per_second": 13.033, + "eval_steps_per_second": 1.629, + "step": 215120 + }, + { + "epoch": 5.01, + "learning_rate": 3.343955137977165e-05, + "loss": 0.2566, + "step": 215500 + }, + { + "epoch": 5.02, + "learning_rate": 3.3400662663721495e-05, + "loss": 0.2267, + "step": 216000 + }, + { + "epoch": 5.03, + "learning_rate": 3.336177394767134e-05, + "loss": 0.2283, + "step": 216500 + }, + { + "epoch": 5.04, + "learning_rate": 3.332288523162119e-05, + "loss": 0.226, + "step": 217000 + }, + { + "epoch": 5.06, + "learning_rate": 3.328399651557104e-05, + "loss": 0.2285, + "step": 217500 + }, + { + "epoch": 5.07, + "learning_rate": 3.3245107799520894e-05, + "loss": 0.2307, + "step": 218000 + }, + { + "epoch": 5.08, + "learning_rate": 3.320621908347074e-05, + "loss": 0.2282, + "step": 218500 + }, + { + "epoch": 5.09, + "learning_rate": 3.316733036742059e-05, + "loss": 0.2302, + "step": 219000 + }, + { + "epoch": 5.1, + "learning_rate": 3.312851942880254e-05, + "loss": 0.2312, + "step": 219500 + }, + { + "epoch": 5.11, + "learning_rate": 3.308970849018449e-05, + "loss": 0.2318, + "step": 220000 + }, + { + "epoch": 5.13, + "learning_rate": 3.305089755156644e-05, + "loss": 0.2389, + "step": 220500 + }, + { + "epoch": 5.14, + "learning_rate": 3.301200883551629e-05, + "loss": 0.2371, + "step": 221000 + }, + { + "epoch": 5.15, + "learning_rate": 3.2973120119466136e-05, + "loss": 0.2396, + "step": 221500 + }, + { + "epoch": 5.16, + "learning_rate": 3.293423140341599e-05, + "loss": 0.2356, + "step": 222000 + }, + { + "epoch": 5.17, + "learning_rate": 3.289534268736584e-05, + "loss": 0.236, + "step": 222500 + }, + { + "epoch": 5.18, + "learning_rate": 3.285645397131569e-05, + "loss": 0.2381, + "step": 223000 + }, + { + "epoch": 5.19, + "learning_rate": 3.281756525526553e-05, + "loss": 0.2396, + "step": 223500 + }, + { + "epoch": 5.21, + "learning_rate": 3.277875431664748e-05, + "loss": 0.2361, + "step": 224000 + }, + { + "epoch": 5.22, + "learning_rate": 3.273986560059733e-05, + "loss": 0.2423, + "step": 224500 + }, + { + "epoch": 5.23, + "learning_rate": 3.270097688454718e-05, + "loss": 0.2407, + "step": 225000 + }, + { + "epoch": 5.24, + "learning_rate": 3.266216594592913e-05, + "loss": 0.2427, + "step": 225500 + }, + { + "epoch": 5.25, + "learning_rate": 3.262327722987898e-05, + "loss": 0.2454, + "step": 226000 + }, + { + "epoch": 5.26, + "learning_rate": 3.258438851382883e-05, + "loss": 0.2484, + "step": 226500 + }, + { + "epoch": 5.28, + "learning_rate": 3.254549979777868e-05, + "loss": 0.2467, + "step": 227000 + }, + { + "epoch": 5.29, + "learning_rate": 3.250661108172853e-05, + "loss": 0.2439, + "step": 227500 + }, + { + "epoch": 5.3, + "learning_rate": 3.2467722365678375e-05, + "loss": 0.2496, + "step": 228000 + }, + { + "epoch": 5.31, + "learning_rate": 3.2428833649628226e-05, + "loss": 0.2486, + "step": 228500 + }, + { + "epoch": 5.32, + "learning_rate": 3.238994493357808e-05, + "loss": 0.2466, + "step": 229000 + }, + { + "epoch": 5.33, + "learning_rate": 3.235105621752792e-05, + "loss": 0.2497, + "step": 229500 + }, + { + "epoch": 5.35, + "learning_rate": 3.2312167501477774e-05, + "loss": 0.2455, + "step": 230000 + }, + { + "epoch": 5.36, + "learning_rate": 3.2273278785427626e-05, + "loss": 0.2532, + "step": 230500 + }, + { + "epoch": 5.37, + "learning_rate": 3.223439006937747e-05, + "loss": 0.2454, + "step": 231000 + }, + { + "epoch": 5.38, + "learning_rate": 3.219557913075942e-05, + "loss": 0.2491, + "step": 231500 + }, + { + "epoch": 5.39, + "learning_rate": 3.2156690414709265e-05, + "loss": 0.2501, + "step": 232000 + }, + { + "epoch": 5.4, + "learning_rate": 3.2117801698659116e-05, + "loss": 0.2528, + "step": 232500 + }, + { + "epoch": 5.42, + "learning_rate": 3.207891298260897e-05, + "loss": 0.2482, + "step": 233000 + }, + { + "epoch": 5.43, + "learning_rate": 3.204002426655881e-05, + "loss": 0.2548, + "step": 233500 + }, + { + "epoch": 5.44, + "learning_rate": 3.200121332794076e-05, + "loss": 0.2565, + "step": 234000 + }, + { + "epoch": 5.45, + "learning_rate": 3.1962324611890614e-05, + "loss": 0.253, + "step": 234500 + }, + { + "epoch": 5.46, + "learning_rate": 3.1923513673272564e-05, + "loss": 0.251, + "step": 235000 + }, + { + "epoch": 5.47, + "learning_rate": 3.1884624957222415e-05, + "loss": 0.2542, + "step": 235500 + }, + { + "epoch": 5.49, + "learning_rate": 3.1845736241172267e-05, + "loss": 0.2611, + "step": 236000 + }, + { + "epoch": 5.5, + "learning_rate": 3.180684752512211e-05, + "loss": 0.2583, + "step": 236500 + }, + { + "epoch": 5.51, + "learning_rate": 3.176795880907196e-05, + "loss": 0.2577, + "step": 237000 + }, + { + "epoch": 5.52, + "learning_rate": 3.1729070093021814e-05, + "loss": 0.2555, + "step": 237500 + }, + { + "epoch": 5.53, + "learning_rate": 3.169018137697166e-05, + "loss": 0.2581, + "step": 238000 + }, + { + "epoch": 5.54, + "learning_rate": 3.165129266092151e-05, + "loss": 0.2525, + "step": 238500 + }, + { + "epoch": 5.56, + "learning_rate": 3.1612403944871355e-05, + "loss": 0.2607, + "step": 239000 + }, + { + "epoch": 5.57, + "learning_rate": 3.157351522882121e-05, + "loss": 0.2623, + "step": 239500 + }, + { + "epoch": 5.58, + "learning_rate": 3.153462651277105e-05, + "loss": 0.2644, + "step": 240000 + }, + { + "epoch": 5.59, + "learning_rate": 3.14957377967209e-05, + "loss": 0.2635, + "step": 240500 + }, + { + "epoch": 5.6, + "learning_rate": 3.1456849080670755e-05, + "loss": 0.2644, + "step": 241000 + }, + { + "epoch": 5.61, + "learning_rate": 3.14179603646206e-05, + "loss": 0.2586, + "step": 241500 + }, + { + "epoch": 5.62, + "learning_rate": 3.137907164857045e-05, + "loss": 0.2688, + "step": 242000 + }, + { + "epoch": 5.64, + "learning_rate": 3.13401829325203e-05, + "loss": 0.2592, + "step": 242500 + }, + { + "epoch": 5.65, + "learning_rate": 3.1301294216470154e-05, + "loss": 0.2627, + "step": 243000 + }, + { + "epoch": 5.66, + "learning_rate": 3.12626388327163e-05, + "loss": 0.2659, + "step": 243500 + }, + { + "epoch": 5.67, + "learning_rate": 3.122375011666615e-05, + "loss": 0.2631, + "step": 244000 + }, + { + "epoch": 5.68, + "learning_rate": 3.1184861400616e-05, + "loss": 0.2647, + "step": 244500 + }, + { + "epoch": 5.69, + "learning_rate": 3.114597268456585e-05, + "loss": 0.2662, + "step": 245000 + }, + { + "epoch": 5.71, + "learning_rate": 3.11070839685157e-05, + "loss": 0.2647, + "step": 245500 + }, + { + "epoch": 5.72, + "learning_rate": 3.106819525246555e-05, + "loss": 0.2735, + "step": 246000 + }, + { + "epoch": 5.73, + "learning_rate": 3.10293843138475e-05, + "loss": 0.27, + "step": 246500 + }, + { + "epoch": 5.74, + "learning_rate": 3.0990495597797345e-05, + "loss": 0.2696, + "step": 247000 + }, + { + "epoch": 5.75, + "learning_rate": 3.095160688174719e-05, + "loss": 0.2652, + "step": 247500 + }, + { + "epoch": 5.76, + "learning_rate": 3.091279594312914e-05, + "loss": 0.2704, + "step": 248000 + }, + { + "epoch": 5.78, + "learning_rate": 3.087390722707899e-05, + "loss": 0.2633, + "step": 248500 + }, + { + "epoch": 5.79, + "learning_rate": 3.083501851102884e-05, + "loss": 0.2696, + "step": 249000 + }, + { + "epoch": 5.8, + "learning_rate": 3.079612979497869e-05, + "loss": 0.2657, + "step": 249500 + }, + { + "epoch": 5.81, + "learning_rate": 3.075724107892854e-05, + "loss": 0.2705, + "step": 250000 + }, + { + "epoch": 5.82, + "learning_rate": 3.071835236287839e-05, + "loss": 0.2727, + "step": 250500 + }, + { + "epoch": 5.83, + "learning_rate": 3.0679463646828235e-05, + "loss": 0.2636, + "step": 251000 + }, + { + "epoch": 5.85, + "learning_rate": 3.064057493077809e-05, + "loss": 0.2772, + "step": 251500 + }, + { + "epoch": 5.86, + "learning_rate": 3.060168621472794e-05, + "loss": 0.2658, + "step": 252000 + }, + { + "epoch": 5.87, + "learning_rate": 3.056279749867779e-05, + "loss": 0.2744, + "step": 252500 + }, + { + "epoch": 5.88, + "learning_rate": 3.0523908782627634e-05, + "loss": 0.2711, + "step": 253000 + }, + { + "epoch": 5.89, + "learning_rate": 3.0485020066577486e-05, + "loss": 0.2668, + "step": 253500 + }, + { + "epoch": 5.9, + "learning_rate": 3.0446131350527334e-05, + "loss": 0.2706, + "step": 254000 + }, + { + "epoch": 5.92, + "learning_rate": 3.040724263447718e-05, + "loss": 0.2698, + "step": 254500 + }, + { + "epoch": 5.93, + "learning_rate": 3.0368353918427027e-05, + "loss": 0.2714, + "step": 255000 + }, + { + "epoch": 5.94, + "learning_rate": 3.0329542979808977e-05, + "loss": 0.2734, + "step": 255500 + }, + { + "epoch": 5.95, + "learning_rate": 3.0290654263758828e-05, + "loss": 0.2696, + "step": 256000 + }, + { + "epoch": 5.96, + "learning_rate": 3.0251765547708676e-05, + "loss": 0.2752, + "step": 256500 + }, + { + "epoch": 5.97, + "learning_rate": 3.0212876831658528e-05, + "loss": 0.2732, + "step": 257000 + }, + { + "epoch": 5.99, + "learning_rate": 3.0173988115608376e-05, + "loss": 0.2778, + "step": 257500 + }, + { + "epoch": 6.0, + "learning_rate": 3.0135177176990326e-05, + "loss": 0.2799, + "step": 258000 + }, + { + "epoch": 6.0, + "eval_bleu": 59.2034, + "eval_gen_len": 15.6702, + "eval_loss": 0.7783872485160828, + "eval_runtime": 6542.9489, + "eval_samples_per_second": 13.131, + "eval_steps_per_second": 1.641, + "step": 258144 + } + ], + "logging_steps": 500, + "max_steps": 645360, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 8.950904749025133e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-258144/training_args.bin b/checkpoint-258144/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c71a9dade62e1afff1c0282f02a162933d87afd7 --- /dev/null +++ b/checkpoint-258144/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d80f6465473a02dd91018cea5b0675845684097cfa8269a7e746c79607dbe1 +size 4856 diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..77b8612fc6bf9e6aaa7560c8bcce3e4c9c6986de --- /dev/null +++ b/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "facebook/mbart-large-50-many-to-many-mmt", + "_num_labels": 3, + "activation_dropout": 0.0, + "activation_function": "relu", + "add_bias_logits": false, + "add_final_layer_norm": true, + "architectures": [ + "MBartForConditionalGeneration" + ], + "attention_dropout": 0.0, + "bos_token_id": 0, + "classif_dropout": 0.0, + "classifier_dropout": 0.0, + "d_model": 1024, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_layers": 12, + "decoder_start_token_id": 2, + "dropout": 0.1, + "early_stopping": true, + "encoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_layers": 12, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "gradient_checkpointing": false, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2" + }, + "init_std": 0.02, + "is_encoder_decoder": true, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2 + }, + "max_length": 200, + "max_position_embeddings": 1024, + "model_type": "mbart", + "normalize_before": true, + "normalize_embedding": true, + "num_beams": 5, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 1, + "scale_embedding": true, + "static_position_embeddings": false, + "tokenizer_class": "MBart50Tokenizer", + "torch_dtype": "float32", + "transformers_version": "4.37.2", + "use_cache": true, + "vocab_size": 250054 +} diff --git a/eval_results.json b/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e4df94376d5a731ab535b54fe2689b8fff64d0e2 --- /dev/null +++ b/eval_results.json @@ -0,0 +1,10 @@ +{ + "epoch": 6.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6644.884, + "eval_samples": 85914, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616 +} \ No newline at end of file diff --git a/generated_predictions.txt b/generated_predictions.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4118d0e412b29c4c38468bb4f6bf640d406863c --- /dev/null +++ b/generated_predictions.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb3f34b1e7d070e98affb0c49e794102237467f7e1646043f8aa38d91ddeca2 +size 11637304 diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b1d0b25bc2cc841a32d579c94d80eb6323756b7a --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 0, + "decoder_start_token_id": 2, + "early_stopping": true, + "eos_token_id": 2, + "forced_bos_token_id": 250014, + "forced_eos_token_id": 2, + "max_length": 200, + "num_beams": 5, + "pad_token_id": 1, + "transformers_version": "4.37.2" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4de350a5baef79d3b3285908a7478968aa943d99 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8ee05becd6c68dae4da8f2188129cca72d4a29e3fc1ff98ff891ac907f3f7b +size 2444578688 diff --git a/predict_results.json b/predict_results.json new file mode 100644 index 0000000000000000000000000000000000000000..55003ebe6466246dd68733da08710481f9dc67ef --- /dev/null +++ b/predict_results.json @@ -0,0 +1,9 @@ +{ + "predict_bleu": 59.4091, + "predict_gen_len": 16.1979, + "predict_loss": 0.6332442164421082, + "predict_runtime": 7442.4048, + "predict_samples": 94835, + "predict_samples_per_second": 12.743, + "predict_steps_per_second": 1.593 +} \ No newline at end of file diff --git a/sentencepiece.bpe.model b/sentencepiece.bpe.model new file mode 100644 index 0000000000000000000000000000000000000000..7a3f40a75f870bc1f21700cd414dc2acc431583c --- /dev/null +++ b/sentencepiece.bpe.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865 +size 5069051 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..92619141640d5fcbb4429807de2248352b0dca79 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,69 @@ +{ + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "cls_token": "", + "eos_token": "", + "mask_token": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "sep_token": "", + "unk_token": "" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..ecc6a4f3075bc2a01607c72e81fd24456ab68311 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfb9b1f3e7ce9f6c1a5ab4560578eda3329db396be400909c5d34c8d0b08b0ed +size 17110208 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..70c0515a5815fcc727e11e053116348bfac12128 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,528 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250001": { + "content": "ar_AR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250002": { + "content": "cs_CZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250003": { + "content": "de_DE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250004": { + "content": "en_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250005": { + "content": "es_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250006": { + "content": "et_EE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250007": { + "content": "fi_FI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250008": { + "content": "fr_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250009": { + "content": "gu_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250010": { + "content": "hi_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250011": { + "content": "it_IT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250012": { + "content": "ja_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250013": { + "content": "kk_KZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250014": { + "content": "ko_KR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250015": { + "content": "lt_LT", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250016": { + "content": "lv_LV", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250017": { + "content": "my_MM", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250018": { + "content": "ne_NP", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250019": { + "content": "nl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250020": { + "content": "ro_RO", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250021": { + "content": "ru_RU", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250022": { + "content": "si_LK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250023": { + "content": "tr_TR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250024": { + "content": "vi_VN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250025": { + "content": "zh_CN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250026": { + "content": "af_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250027": { + "content": "az_AZ", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250028": { + "content": "bn_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250029": { + "content": "fa_IR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250030": { + "content": "he_IL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250031": { + "content": "hr_HR", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250032": { + "content": "id_ID", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250033": { + "content": "ka_GE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250034": { + "content": "km_KH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250035": { + "content": "mk_MK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250036": { + "content": "ml_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250037": { + "content": "mn_MN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250038": { + "content": "mr_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250039": { + "content": "pl_PL", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250040": { + "content": "ps_AF", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250041": { + "content": "pt_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250042": { + "content": "sv_SE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250043": { + "content": "sw_KE", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250044": { + "content": "ta_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250045": { + "content": "te_IN", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250046": { + "content": "th_TH", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250047": { + "content": "tl_XX", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250048": { + "content": "uk_UA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250049": { + "content": "ur_PK", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250050": { + "content": "xh_ZA", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250051": { + "content": "gl_ES", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250052": { + "content": "sl_SI", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "250053": { + "content": "", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI" + ], + "bos_token": "", + "clean_up_tokenization_spaces": true, + "cls_token": "", + "eos_token": "", + "language_codes": "ML50", + "mask_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sep_token": "", + "sp_model_kwargs": {}, + "src_lang": "ja_XX", + "tgt_lang": "ko_KR", + "tokenizer_class": "MBart50Tokenizer", + "unk_token": "" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..afb1499f9b7a0cad81870cd1885982f64c5d7194 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 6.0, + "train_loss": 0.5174947596547979, + "train_runtime": 240022.8696, + "train_samples": 688378, + "train_samples_per_second": 43.02, + "train_steps_per_second": 2.689 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f866739ce0f0bb639146978d013046c799577ea9 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,3186 @@ +{ + "best_metric": 0.6417234539985657, + "best_model_checkpoint": "./enko_mbartLarge_100p_sup2/checkpoint-129072", + "epoch": 6.0, + "eval_steps": 500, + "global_step": 258144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 9.940000000000001e-06, + "loss": 2.2901, + "step": 500 + }, + { + "epoch": 0.02, + "learning_rate": 1.992e-05, + "loss": 1.6084, + "step": 1000 + }, + { + "epoch": 0.03, + "learning_rate": 2.9920000000000005e-05, + "loss": 1.449, + "step": 1500 + }, + { + "epoch": 0.05, + "learning_rate": 3.9920000000000004e-05, + "loss": 1.3874, + "step": 2000 + }, + { + "epoch": 0.06, + "learning_rate": 4.992e-05, + "loss": 1.3485, + "step": 2500 + }, + { + "epoch": 0.07, + "learning_rate": 4.996142239367825e-05, + "loss": 1.2926, + "step": 3000 + }, + { + "epoch": 0.08, + "learning_rate": 4.9922533677628105e-05, + "loss": 1.248, + "step": 3500 + }, + { + "epoch": 0.09, + "learning_rate": 4.988364496157795e-05, + "loss": 1.1931, + "step": 4000 + }, + { + "epoch": 0.1, + "learning_rate": 4.98447562455278e-05, + "loss": 1.1677, + "step": 4500 + }, + { + "epoch": 0.12, + "learning_rate": 4.980594530690975e-05, + "loss": 1.142, + "step": 5000 + }, + { + "epoch": 0.13, + "learning_rate": 4.97670565908596e-05, + "loss": 1.122, + "step": 5500 + }, + { + "epoch": 0.14, + "learning_rate": 4.972816787480945e-05, + "loss": 1.0855, + "step": 6000 + }, + { + "epoch": 0.15, + "learning_rate": 4.968927915875929e-05, + "loss": 1.0719, + "step": 6500 + }, + { + "epoch": 0.16, + "learning_rate": 4.965039044270914e-05, + "loss": 1.053, + "step": 7000 + }, + { + "epoch": 0.17, + "learning_rate": 4.9611501726658995e-05, + "loss": 1.0359, + "step": 7500 + }, + { + "epoch": 0.19, + "learning_rate": 4.957261301060884e-05, + "loss": 1.0232, + "step": 8000 + }, + { + "epoch": 0.2, + "learning_rate": 4.953380207199079e-05, + "loss": 1.0169, + "step": 8500 + }, + { + "epoch": 0.21, + "learning_rate": 4.949491335594064e-05, + "loss": 1.0141, + "step": 9000 + }, + { + "epoch": 0.22, + "learning_rate": 4.945602463989049e-05, + "loss": 0.9855, + "step": 9500 + }, + { + "epoch": 0.23, + "learning_rate": 4.941721370127244e-05, + "loss": 0.9932, + "step": 10000 + }, + { + "epoch": 0.24, + "learning_rate": 4.9378324985222293e-05, + "loss": 0.9656, + "step": 10500 + }, + { + "epoch": 0.26, + "learning_rate": 4.933943626917214e-05, + "loss": 0.9694, + "step": 11000 + }, + { + "epoch": 0.27, + "learning_rate": 4.930054755312199e-05, + "loss": 0.9609, + "step": 11500 + }, + { + "epoch": 0.28, + "learning_rate": 4.926165883707184e-05, + "loss": 0.9526, + "step": 12000 + }, + { + "epoch": 0.29, + "learning_rate": 4.9222770121021686e-05, + "loss": 0.9359, + "step": 12500 + }, + { + "epoch": 0.3, + "learning_rate": 4.9183959182403636e-05, + "loss": 0.9324, + "step": 13000 + }, + { + "epoch": 0.31, + "learning_rate": 4.914507046635349e-05, + "loss": 0.9251, + "step": 13500 + }, + { + "epoch": 0.33, + "learning_rate": 4.910625952773543e-05, + "loss": 0.9216, + "step": 14000 + }, + { + "epoch": 0.34, + "learning_rate": 4.906737081168528e-05, + "loss": 0.9181, + "step": 14500 + }, + { + "epoch": 0.35, + "learning_rate": 4.902855987306723e-05, + "loss": 0.9098, + "step": 15000 + }, + { + "epoch": 0.36, + "learning_rate": 4.898967115701708e-05, + "loss": 0.9067, + "step": 15500 + }, + { + "epoch": 0.37, + "learning_rate": 4.895078244096693e-05, + "loss": 0.8931, + "step": 16000 + }, + { + "epoch": 0.38, + "learning_rate": 4.891189372491678e-05, + "loss": 0.8923, + "step": 16500 + }, + { + "epoch": 0.4, + "learning_rate": 4.887300500886663e-05, + "loss": 0.8703, + "step": 17000 + }, + { + "epoch": 0.41, + "learning_rate": 4.8834116292816475e-05, + "loss": 0.8861, + "step": 17500 + }, + { + "epoch": 0.42, + "learning_rate": 4.879522757676633e-05, + "loss": 0.8864, + "step": 18000 + }, + { + "epoch": 0.43, + "learning_rate": 4.875633886071618e-05, + "loss": 0.886, + "step": 18500 + }, + { + "epoch": 0.44, + "learning_rate": 4.871752792209813e-05, + "loss": 0.8737, + "step": 19000 + }, + { + "epoch": 0.45, + "learning_rate": 4.867863920604798e-05, + "loss": 0.8708, + "step": 19500 + }, + { + "epoch": 0.46, + "learning_rate": 4.8639750489997824e-05, + "loss": 0.865, + "step": 20000 + }, + { + "epoch": 0.48, + "learning_rate": 4.8600861773947676e-05, + "loss": 0.8571, + "step": 20500 + }, + { + "epoch": 0.49, + "learning_rate": 4.856197305789753e-05, + "loss": 0.8621, + "step": 21000 + }, + { + "epoch": 0.5, + "learning_rate": 4.852308434184737e-05, + "loss": 0.8607, + "step": 21500 + }, + { + "epoch": 0.51, + "learning_rate": 4.848419562579722e-05, + "loss": 0.8519, + "step": 22000 + }, + { + "epoch": 0.52, + "learning_rate": 4.844530690974707e-05, + "loss": 0.8472, + "step": 22500 + }, + { + "epoch": 0.53, + "learning_rate": 4.840641819369692e-05, + "loss": 0.8381, + "step": 23000 + }, + { + "epoch": 0.55, + "learning_rate": 4.8367529477646765e-05, + "loss": 0.8329, + "step": 23500 + }, + { + "epoch": 0.56, + "learning_rate": 4.8328640761596616e-05, + "loss": 0.8425, + "step": 24000 + }, + { + "epoch": 0.57, + "learning_rate": 4.828975204554647e-05, + "loss": 0.8227, + "step": 24500 + }, + { + "epoch": 0.58, + "learning_rate": 4.825086332949631e-05, + "loss": 0.8349, + "step": 25000 + }, + { + "epoch": 0.59, + "learning_rate": 4.8211974613446164e-05, + "loss": 0.8235, + "step": 25500 + }, + { + "epoch": 0.6, + "learning_rate": 4.8173085897396015e-05, + "loss": 0.8116, + "step": 26000 + }, + { + "epoch": 0.62, + "learning_rate": 4.813419718134587e-05, + "loss": 0.8367, + "step": 26500 + }, + { + "epoch": 0.63, + "learning_rate": 4.809530846529571e-05, + "loss": 0.8219, + "step": 27000 + }, + { + "epoch": 0.64, + "learning_rate": 4.805641974924556e-05, + "loss": 0.8206, + "step": 27500 + }, + { + "epoch": 0.65, + "learning_rate": 4.8017531033195415e-05, + "loss": 0.8199, + "step": 28000 + }, + { + "epoch": 0.66, + "learning_rate": 4.797864231714526e-05, + "loss": 0.816, + "step": 28500 + }, + { + "epoch": 0.67, + "learning_rate": 4.793983137852721e-05, + "loss": 0.8144, + "step": 29000 + }, + { + "epoch": 0.69, + "learning_rate": 4.790102043990916e-05, + "loss": 0.806, + "step": 29500 + }, + { + "epoch": 0.7, + "learning_rate": 4.786220950129111e-05, + "loss": 0.8134, + "step": 30000 + }, + { + "epoch": 0.71, + "learning_rate": 4.782332078524095e-05, + "loss": 0.8109, + "step": 30500 + }, + { + "epoch": 0.72, + "learning_rate": 4.7784432069190805e-05, + "loss": 0.792, + "step": 31000 + }, + { + "epoch": 0.73, + "learning_rate": 4.7745543353140656e-05, + "loss": 0.8046, + "step": 31500 + }, + { + "epoch": 0.74, + "learning_rate": 4.77066546370905e-05, + "loss": 0.7983, + "step": 32000 + }, + { + "epoch": 0.76, + "learning_rate": 4.766776592104035e-05, + "loss": 0.7937, + "step": 32500 + }, + { + "epoch": 0.77, + "learning_rate": 4.7628877204990204e-05, + "loss": 0.7978, + "step": 33000 + }, + { + "epoch": 0.78, + "learning_rate": 4.758998848894005e-05, + "loss": 0.7961, + "step": 33500 + }, + { + "epoch": 0.79, + "learning_rate": 4.75510997728899e-05, + "loss": 0.7854, + "step": 34000 + }, + { + "epoch": 0.8, + "learning_rate": 4.751221105683975e-05, + "loss": 0.8006, + "step": 34500 + }, + { + "epoch": 0.81, + "learning_rate": 4.7473322340789597e-05, + "loss": 0.7857, + "step": 35000 + }, + { + "epoch": 0.83, + "learning_rate": 4.743443362473945e-05, + "loss": 0.7857, + "step": 35500 + }, + { + "epoch": 0.84, + "learning_rate": 4.73956226861214e-05, + "loss": 0.7874, + "step": 36000 + }, + { + "epoch": 0.85, + "learning_rate": 4.735673397007125e-05, + "loss": 0.7826, + "step": 36500 + }, + { + "epoch": 0.86, + "learning_rate": 4.73178452540211e-05, + "loss": 0.7902, + "step": 37000 + }, + { + "epoch": 0.87, + "learning_rate": 4.727895653797094e-05, + "loss": 0.7695, + "step": 37500 + }, + { + "epoch": 0.88, + "learning_rate": 4.724006782192079e-05, + "loss": 0.7789, + "step": 38000 + }, + { + "epoch": 0.89, + "learning_rate": 4.720117910587064e-05, + "loss": 0.7739, + "step": 38500 + }, + { + "epoch": 0.91, + "learning_rate": 4.716229038982049e-05, + "loss": 0.7726, + "step": 39000 + }, + { + "epoch": 0.92, + "learning_rate": 4.712340167377034e-05, + "loss": 0.7722, + "step": 39500 + }, + { + "epoch": 0.93, + "learning_rate": 4.708451295772019e-05, + "loss": 0.7656, + "step": 40000 + }, + { + "epoch": 0.94, + "learning_rate": 4.704562424167004e-05, + "loss": 0.77, + "step": 40500 + }, + { + "epoch": 0.95, + "learning_rate": 4.700689108048409e-05, + "loss": 0.7814, + "step": 41000 + }, + { + "epoch": 0.96, + "learning_rate": 4.696800236443394e-05, + "loss": 0.7704, + "step": 41500 + }, + { + "epoch": 0.98, + "learning_rate": 4.692919142581589e-05, + "loss": 0.7611, + "step": 42000 + }, + { + "epoch": 0.99, + "learning_rate": 4.689038048719784e-05, + "loss": 0.7629, + "step": 42500 + }, + { + "epoch": 1.0, + "learning_rate": 4.6851491771147685e-05, + "loss": 0.7676, + "step": 43000 + }, + { + "epoch": 1.0, + "eval_bleu": 55.2526, + "eval_gen_len": 16.382, + "eval_loss": 0.7125306129455566, + "eval_runtime": 7352.67, + "eval_samples_per_second": 11.685, + "eval_steps_per_second": 1.461, + "step": 43024 + }, + { + "epoch": 1.01, + "learning_rate": 4.6812603055097536e-05, + "loss": 0.6434, + "step": 43500 + }, + { + "epoch": 1.02, + "learning_rate": 4.6773792116479486e-05, + "loss": 0.6436, + "step": 44000 + }, + { + "epoch": 1.03, + "learning_rate": 4.673490340042934e-05, + "loss": 0.651, + "step": 44500 + }, + { + "epoch": 1.05, + "learning_rate": 4.669601468437918e-05, + "loss": 0.6517, + "step": 45000 + }, + { + "epoch": 1.06, + "learning_rate": 4.665712596832903e-05, + "loss": 0.6416, + "step": 45500 + }, + { + "epoch": 1.07, + "learning_rate": 4.661823725227888e-05, + "loss": 0.6517, + "step": 46000 + }, + { + "epoch": 1.08, + "learning_rate": 4.657934853622873e-05, + "loss": 0.6539, + "step": 46500 + }, + { + "epoch": 1.09, + "learning_rate": 4.654045982017858e-05, + "loss": 0.6587, + "step": 47000 + }, + { + "epoch": 1.1, + "learning_rate": 4.6501571104128426e-05, + "loss": 0.6518, + "step": 47500 + }, + { + "epoch": 1.12, + "learning_rate": 4.646268238807828e-05, + "loss": 0.6465, + "step": 48000 + }, + { + "epoch": 1.13, + "learning_rate": 4.642379367202813e-05, + "loss": 0.6437, + "step": 48500 + }, + { + "epoch": 1.14, + "learning_rate": 4.6384904955977974e-05, + "loss": 0.6458, + "step": 49000 + }, + { + "epoch": 1.15, + "learning_rate": 4.6346016239927825e-05, + "loss": 0.6476, + "step": 49500 + }, + { + "epoch": 1.16, + "learning_rate": 4.630712752387768e-05, + "loss": 0.6587, + "step": 50000 + }, + { + "epoch": 1.17, + "learning_rate": 4.626823880782752e-05, + "loss": 0.6392, + "step": 50500 + }, + { + "epoch": 1.19, + "learning_rate": 4.622935009177737e-05, + "loss": 0.6547, + "step": 51000 + }, + { + "epoch": 1.2, + "learning_rate": 4.6190461375727225e-05, + "loss": 0.6511, + "step": 51500 + }, + { + "epoch": 1.21, + "learning_rate": 4.615157265967707e-05, + "loss": 0.6522, + "step": 52000 + }, + { + "epoch": 1.22, + "learning_rate": 4.611268394362692e-05, + "loss": 0.6555, + "step": 52500 + }, + { + "epoch": 1.23, + "learning_rate": 4.607395078244097e-05, + "loss": 0.6461, + "step": 53000 + }, + { + "epoch": 1.24, + "learning_rate": 4.6035062066390814e-05, + "loss": 0.6471, + "step": 53500 + }, + { + "epoch": 1.26, + "learning_rate": 4.5996173350340665e-05, + "loss": 0.6563, + "step": 54000 + }, + { + "epoch": 1.27, + "learning_rate": 4.5957284634290517e-05, + "loss": 0.6447, + "step": 54500 + }, + { + "epoch": 1.28, + "learning_rate": 4.591839591824036e-05, + "loss": 0.6483, + "step": 55000 + }, + { + "epoch": 1.29, + "learning_rate": 4.587958497962231e-05, + "loss": 0.6475, + "step": 55500 + }, + { + "epoch": 1.3, + "learning_rate": 4.584069626357216e-05, + "loss": 0.6546, + "step": 56000 + }, + { + "epoch": 1.31, + "learning_rate": 4.5801807547522014e-05, + "loss": 0.6528, + "step": 56500 + }, + { + "epoch": 1.32, + "learning_rate": 4.5762918831471866e-05, + "loss": 0.6538, + "step": 57000 + }, + { + "epoch": 1.34, + "learning_rate": 4.572403011542171e-05, + "loss": 0.6511, + "step": 57500 + }, + { + "epoch": 1.35, + "learning_rate": 4.568521917680366e-05, + "loss": 0.6457, + "step": 58000 + }, + { + "epoch": 1.36, + "learning_rate": 4.564633046075351e-05, + "loss": 0.659, + "step": 58500 + }, + { + "epoch": 1.37, + "learning_rate": 4.560744174470336e-05, + "loss": 0.6358, + "step": 59000 + }, + { + "epoch": 1.38, + "learning_rate": 4.556855302865321e-05, + "loss": 0.6425, + "step": 59500 + }, + { + "epoch": 1.39, + "learning_rate": 4.552966431260306e-05, + "loss": 0.644, + "step": 60000 + }, + { + "epoch": 1.41, + "learning_rate": 4.5490775596552904e-05, + "loss": 0.6537, + "step": 60500 + }, + { + "epoch": 1.42, + "learning_rate": 4.5451886880502756e-05, + "loss": 0.6529, + "step": 61000 + }, + { + "epoch": 1.43, + "learning_rate": 4.5413075941884705e-05, + "loss": 0.6547, + "step": 61500 + }, + { + "epoch": 1.44, + "learning_rate": 4.537418722583455e-05, + "loss": 0.6443, + "step": 62000 + }, + { + "epoch": 1.45, + "learning_rate": 4.53352985097844e-05, + "loss": 0.6516, + "step": 62500 + }, + { + "epoch": 1.46, + "learning_rate": 4.529640979373425e-05, + "loss": 0.6439, + "step": 63000 + }, + { + "epoch": 1.48, + "learning_rate": 4.52575210776841e-05, + "loss": 0.6401, + "step": 63500 + }, + { + "epoch": 1.49, + "learning_rate": 4.521863236163395e-05, + "loss": 0.6545, + "step": 64000 + }, + { + "epoch": 1.5, + "learning_rate": 4.51797436455838e-05, + "loss": 0.6433, + "step": 64500 + }, + { + "epoch": 1.51, + "learning_rate": 4.514085492953365e-05, + "loss": 0.6538, + "step": 65000 + }, + { + "epoch": 1.52, + "learning_rate": 4.51019662134835e-05, + "loss": 0.6548, + "step": 65500 + }, + { + "epoch": 1.53, + "learning_rate": 4.506307749743335e-05, + "loss": 0.6458, + "step": 66000 + }, + { + "epoch": 1.55, + "learning_rate": 4.50241887813832e-05, + "loss": 0.649, + "step": 66500 + }, + { + "epoch": 1.56, + "learning_rate": 4.4985300065333045e-05, + "loss": 0.6471, + "step": 67000 + }, + { + "epoch": 1.57, + "learning_rate": 4.4946411349282896e-05, + "loss": 0.6526, + "step": 67500 + }, + { + "epoch": 1.58, + "learning_rate": 4.4907600410664846e-05, + "loss": 0.646, + "step": 68000 + }, + { + "epoch": 1.59, + "learning_rate": 4.486878947204679e-05, + "loss": 0.6491, + "step": 68500 + }, + { + "epoch": 1.6, + "learning_rate": 4.482990075599664e-05, + "loss": 0.6484, + "step": 69000 + }, + { + "epoch": 1.62, + "learning_rate": 4.479101203994649e-05, + "loss": 0.6421, + "step": 69500 + }, + { + "epoch": 1.63, + "learning_rate": 4.475220110132844e-05, + "loss": 0.6498, + "step": 70000 + }, + { + "epoch": 1.64, + "learning_rate": 4.4713312385278286e-05, + "loss": 0.6497, + "step": 70500 + }, + { + "epoch": 1.65, + "learning_rate": 4.467442366922814e-05, + "loss": 0.6503, + "step": 71000 + }, + { + "epoch": 1.66, + "learning_rate": 4.463553495317799e-05, + "loss": 0.6314, + "step": 71500 + }, + { + "epoch": 1.67, + "learning_rate": 4.459672401455994e-05, + "loss": 0.6495, + "step": 72000 + }, + { + "epoch": 1.69, + "learning_rate": 4.4557835298509784e-05, + "loss": 0.643, + "step": 72500 + }, + { + "epoch": 1.7, + "learning_rate": 4.4518946582459635e-05, + "loss": 0.6542, + "step": 73000 + }, + { + "epoch": 1.71, + "learning_rate": 4.448005786640949e-05, + "loss": 0.6522, + "step": 73500 + }, + { + "epoch": 1.72, + "learning_rate": 4.444116915035934e-05, + "loss": 0.6476, + "step": 74000 + }, + { + "epoch": 1.73, + "learning_rate": 4.440228043430918e-05, + "loss": 0.6337, + "step": 74500 + }, + { + "epoch": 1.74, + "learning_rate": 4.4363391718259035e-05, + "loss": 0.6546, + "step": 75000 + }, + { + "epoch": 1.75, + "learning_rate": 4.4324503002208886e-05, + "loss": 0.647, + "step": 75500 + }, + { + "epoch": 1.77, + "learning_rate": 4.4285614286158724e-05, + "loss": 0.6467, + "step": 76000 + }, + { + "epoch": 1.78, + "learning_rate": 4.4246725570108576e-05, + "loss": 0.6432, + "step": 76500 + }, + { + "epoch": 1.79, + "learning_rate": 4.420783685405843e-05, + "loss": 0.6399, + "step": 77000 + }, + { + "epoch": 1.8, + "learning_rate": 4.416894813800828e-05, + "loss": 0.6382, + "step": 77500 + }, + { + "epoch": 1.81, + "learning_rate": 4.4130059421958123e-05, + "loss": 0.649, + "step": 78000 + }, + { + "epoch": 1.82, + "learning_rate": 4.409124848334007e-05, + "loss": 0.6385, + "step": 78500 + }, + { + "epoch": 1.84, + "learning_rate": 4.4052359767289925e-05, + "loss": 0.6448, + "step": 79000 + }, + { + "epoch": 1.85, + "learning_rate": 4.4013471051239776e-05, + "loss": 0.638, + "step": 79500 + }, + { + "epoch": 1.86, + "learning_rate": 4.397458233518962e-05, + "loss": 0.6317, + "step": 80000 + }, + { + "epoch": 1.87, + "learning_rate": 4.393569361913947e-05, + "loss": 0.6338, + "step": 80500 + }, + { + "epoch": 1.88, + "learning_rate": 4.3896804903089324e-05, + "loss": 0.6406, + "step": 81000 + }, + { + "epoch": 1.89, + "learning_rate": 4.385791618703917e-05, + "loss": 0.6363, + "step": 81500 + }, + { + "epoch": 1.91, + "learning_rate": 4.381902747098902e-05, + "loss": 0.6381, + "step": 82000 + }, + { + "epoch": 1.92, + "learning_rate": 4.378021653237097e-05, + "loss": 0.6383, + "step": 82500 + }, + { + "epoch": 1.93, + "learning_rate": 4.374140559375292e-05, + "loss": 0.6351, + "step": 83000 + }, + { + "epoch": 1.94, + "learning_rate": 4.370251687770277e-05, + "loss": 0.642, + "step": 83500 + }, + { + "epoch": 1.95, + "learning_rate": 4.3663705939084714e-05, + "loss": 0.6399, + "step": 84000 + }, + { + "epoch": 1.96, + "learning_rate": 4.3624817223034566e-05, + "loss": 0.6351, + "step": 84500 + }, + { + "epoch": 1.98, + "learning_rate": 4.358592850698442e-05, + "loss": 0.631, + "step": 85000 + }, + { + "epoch": 1.99, + "learning_rate": 4.354703979093426e-05, + "loss": 0.6298, + "step": 85500 + }, + { + "epoch": 2.0, + "learning_rate": 4.350822885231621e-05, + "loss": 0.6349, + "step": 86000 + }, + { + "epoch": 2.0, + "eval_bleu": 58.202, + "eval_gen_len": 15.9466, + "eval_loss": 0.6546894311904907, + "eval_runtime": 6844.8665, + "eval_samples_per_second": 12.552, + "eval_steps_per_second": 1.569, + "step": 86048 + }, + { + "epoch": 2.01, + "learning_rate": 4.346934013626606e-05, + "loss": 0.5093, + "step": 86500 + }, + { + "epoch": 2.02, + "learning_rate": 4.3430451420215914e-05, + "loss": 0.5066, + "step": 87000 + }, + { + "epoch": 2.03, + "learning_rate": 4.339156270416576e-05, + "loss": 0.4956, + "step": 87500 + }, + { + "epoch": 2.05, + "learning_rate": 4.335267398811561e-05, + "loss": 0.5, + "step": 88000 + }, + { + "epoch": 2.06, + "learning_rate": 4.331378527206546e-05, + "loss": 0.5034, + "step": 88500 + }, + { + "epoch": 2.07, + "learning_rate": 4.327489655601531e-05, + "loss": 0.505, + "step": 89000 + }, + { + "epoch": 2.08, + "learning_rate": 4.323600783996516e-05, + "loss": 0.4995, + "step": 89500 + }, + { + "epoch": 2.09, + "learning_rate": 4.319719690134711e-05, + "loss": 0.5064, + "step": 90000 + }, + { + "epoch": 2.1, + "learning_rate": 4.315830818529696e-05, + "loss": 0.5065, + "step": 90500 + }, + { + "epoch": 2.12, + "learning_rate": 4.311949724667891e-05, + "loss": 0.5031, + "step": 91000 + }, + { + "epoch": 2.13, + "learning_rate": 4.3080608530628754e-05, + "loss": 0.4967, + "step": 91500 + }, + { + "epoch": 2.14, + "learning_rate": 4.30417198145786e-05, + "loss": 0.5112, + "step": 92000 + }, + { + "epoch": 2.15, + "learning_rate": 4.300290887596055e-05, + "loss": 0.5069, + "step": 92500 + }, + { + "epoch": 2.16, + "learning_rate": 4.29640201599104e-05, + "loss": 0.5106, + "step": 93000 + }, + { + "epoch": 2.17, + "learning_rate": 4.292513144386025e-05, + "loss": 0.5077, + "step": 93500 + }, + { + "epoch": 2.18, + "learning_rate": 4.28862427278101e-05, + "loss": 0.5179, + "step": 94000 + }, + { + "epoch": 2.2, + "learning_rate": 4.284735401175995e-05, + "loss": 0.5192, + "step": 94500 + }, + { + "epoch": 2.21, + "learning_rate": 4.28084652957098e-05, + "loss": 0.5103, + "step": 95000 + }, + { + "epoch": 2.22, + "learning_rate": 4.276957657965965e-05, + "loss": 0.5203, + "step": 95500 + }, + { + "epoch": 2.23, + "learning_rate": 4.2730687863609496e-05, + "loss": 0.5179, + "step": 96000 + }, + { + "epoch": 2.24, + "learning_rate": 4.269179914755935e-05, + "loss": 0.515, + "step": 96500 + }, + { + "epoch": 2.25, + "learning_rate": 4.26529104315092e-05, + "loss": 0.5133, + "step": 97000 + }, + { + "epoch": 2.27, + "learning_rate": 4.2614021715459043e-05, + "loss": 0.5147, + "step": 97500 + }, + { + "epoch": 2.28, + "learning_rate": 4.2575132999408895e-05, + "loss": 0.5308, + "step": 98000 + }, + { + "epoch": 2.29, + "learning_rate": 4.2536322060790845e-05, + "loss": 0.5207, + "step": 98500 + }, + { + "epoch": 2.3, + "learning_rate": 4.2497511122172794e-05, + "loss": 0.5228, + "step": 99000 + }, + { + "epoch": 2.31, + "learning_rate": 4.245862240612264e-05, + "loss": 0.5207, + "step": 99500 + }, + { + "epoch": 2.32, + "learning_rate": 4.241973369007249e-05, + "loss": 0.5211, + "step": 100000 + }, + { + "epoch": 2.34, + "learning_rate": 4.2380844974022335e-05, + "loss": 0.5169, + "step": 100500 + }, + { + "epoch": 2.35, + "learning_rate": 4.234195625797219e-05, + "loss": 0.5302, + "step": 101000 + }, + { + "epoch": 2.36, + "learning_rate": 4.230306754192204e-05, + "loss": 0.5155, + "step": 101500 + }, + { + "epoch": 2.37, + "learning_rate": 4.226417882587188e-05, + "loss": 0.5193, + "step": 102000 + }, + { + "epoch": 2.38, + "learning_rate": 4.2225290109821735e-05, + "loss": 0.518, + "step": 102500 + }, + { + "epoch": 2.39, + "learning_rate": 4.2186401393771586e-05, + "loss": 0.5171, + "step": 103000 + }, + { + "epoch": 2.41, + "learning_rate": 4.214751267772144e-05, + "loss": 0.5208, + "step": 103500 + }, + { + "epoch": 2.42, + "learning_rate": 4.210862396167128e-05, + "loss": 0.5231, + "step": 104000 + }, + { + "epoch": 2.43, + "learning_rate": 4.2069735245621134e-05, + "loss": 0.5267, + "step": 104500 + }, + { + "epoch": 2.44, + "learning_rate": 4.2030846529570985e-05, + "loss": 0.5196, + "step": 105000 + }, + { + "epoch": 2.45, + "learning_rate": 4.199195781352083e-05, + "loss": 0.5234, + "step": 105500 + }, + { + "epoch": 2.46, + "learning_rate": 4.1953224652334885e-05, + "loss": 0.526, + "step": 106000 + }, + { + "epoch": 2.48, + "learning_rate": 4.191449149114893e-05, + "loss": 0.5312, + "step": 106500 + }, + { + "epoch": 2.49, + "learning_rate": 4.187560277509878e-05, + "loss": 0.5286, + "step": 107000 + }, + { + "epoch": 2.5, + "learning_rate": 4.183671405904863e-05, + "loss": 0.5225, + "step": 107500 + }, + { + "epoch": 2.51, + "learning_rate": 4.1797825342998474e-05, + "loss": 0.5207, + "step": 108000 + }, + { + "epoch": 2.52, + "learning_rate": 4.1758936626948325e-05, + "loss": 0.5263, + "step": 108500 + }, + { + "epoch": 2.53, + "learning_rate": 4.172004791089818e-05, + "loss": 0.5243, + "step": 109000 + }, + { + "epoch": 2.55, + "learning_rate": 4.1681236972280126e-05, + "loss": 0.5332, + "step": 109500 + }, + { + "epoch": 2.56, + "learning_rate": 4.164234825622997e-05, + "loss": 0.5297, + "step": 110000 + }, + { + "epoch": 2.57, + "learning_rate": 4.160345954017982e-05, + "loss": 0.5274, + "step": 110500 + }, + { + "epoch": 2.58, + "learning_rate": 4.1564570824129674e-05, + "loss": 0.5241, + "step": 111000 + }, + { + "epoch": 2.59, + "learning_rate": 4.152568210807952e-05, + "loss": 0.528, + "step": 111500 + }, + { + "epoch": 2.6, + "learning_rate": 4.148679339202937e-05, + "loss": 0.53, + "step": 112000 + }, + { + "epoch": 2.61, + "learning_rate": 4.144790467597922e-05, + "loss": 0.5291, + "step": 112500 + }, + { + "epoch": 2.63, + "learning_rate": 4.1409015959929073e-05, + "loss": 0.5431, + "step": 113000 + }, + { + "epoch": 2.64, + "learning_rate": 4.137012724387892e-05, + "loss": 0.5336, + "step": 113500 + }, + { + "epoch": 2.65, + "learning_rate": 4.133123852782877e-05, + "loss": 0.531, + "step": 114000 + }, + { + "epoch": 2.66, + "learning_rate": 4.129234981177862e-05, + "loss": 0.5278, + "step": 114500 + }, + { + "epoch": 2.67, + "learning_rate": 4.1253461095728466e-05, + "loss": 0.5214, + "step": 115000 + }, + { + "epoch": 2.68, + "learning_rate": 4.121457237967831e-05, + "loss": 0.5249, + "step": 115500 + }, + { + "epoch": 2.7, + "learning_rate": 4.117568366362816e-05, + "loss": 0.5351, + "step": 116000 + }, + { + "epoch": 2.71, + "learning_rate": 4.1136794947578014e-05, + "loss": 0.5275, + "step": 116500 + }, + { + "epoch": 2.72, + "learning_rate": 4.109790623152786e-05, + "loss": 0.5309, + "step": 117000 + }, + { + "epoch": 2.73, + "learning_rate": 4.105901751547771e-05, + "loss": 0.5291, + "step": 117500 + }, + { + "epoch": 2.74, + "learning_rate": 4.102012879942756e-05, + "loss": 0.5274, + "step": 118000 + }, + { + "epoch": 2.75, + "learning_rate": 4.098131786080951e-05, + "loss": 0.532, + "step": 118500 + }, + { + "epoch": 2.77, + "learning_rate": 4.0942429144759356e-05, + "loss": 0.5278, + "step": 119000 + }, + { + "epoch": 2.78, + "learning_rate": 4.090354042870921e-05, + "loss": 0.5338, + "step": 119500 + }, + { + "epoch": 2.79, + "learning_rate": 4.086465171265906e-05, + "loss": 0.5289, + "step": 120000 + }, + { + "epoch": 2.8, + "learning_rate": 4.082576299660891e-05, + "loss": 0.5388, + "step": 120500 + }, + { + "epoch": 2.81, + "learning_rate": 4.0786874280558755e-05, + "loss": 0.5337, + "step": 121000 + }, + { + "epoch": 2.82, + "learning_rate": 4.074798556450861e-05, + "loss": 0.5308, + "step": 121500 + }, + { + "epoch": 2.84, + "learning_rate": 4.0709174625890556e-05, + "loss": 0.5294, + "step": 122000 + }, + { + "epoch": 2.85, + "learning_rate": 4.067028590984041e-05, + "loss": 0.5385, + "step": 122500 + }, + { + "epoch": 2.86, + "learning_rate": 4.063139719379025e-05, + "loss": 0.5379, + "step": 123000 + }, + { + "epoch": 2.87, + "learning_rate": 4.05925084777401e-05, + "loss": 0.5485, + "step": 123500 + }, + { + "epoch": 2.88, + "learning_rate": 4.055361976168995e-05, + "loss": 0.5391, + "step": 124000 + }, + { + "epoch": 2.89, + "learning_rate": 4.05147310456398e-05, + "loss": 0.5414, + "step": 124500 + }, + { + "epoch": 2.91, + "learning_rate": 4.0475842329589645e-05, + "loss": 0.5237, + "step": 125000 + }, + { + "epoch": 2.92, + "learning_rate": 4.04369536135395e-05, + "loss": 0.5363, + "step": 125500 + }, + { + "epoch": 2.93, + "learning_rate": 4.039806489748935e-05, + "loss": 0.543, + "step": 126000 + }, + { + "epoch": 2.94, + "learning_rate": 4.0359331736303396e-05, + "loss": 0.5265, + "step": 126500 + }, + { + "epoch": 2.95, + "learning_rate": 4.032044302025325e-05, + "loss": 0.5379, + "step": 127000 + }, + { + "epoch": 2.96, + "learning_rate": 4.028155430420309e-05, + "loss": 0.5311, + "step": 127500 + }, + { + "epoch": 2.98, + "learning_rate": 4.0242665588152944e-05, + "loss": 0.5361, + "step": 128000 + }, + { + "epoch": 2.99, + "learning_rate": 4.0203776872102795e-05, + "loss": 0.5405, + "step": 128500 + }, + { + "epoch": 3.0, + "learning_rate": 4.016488815605265e-05, + "loss": 0.537, + "step": 129000 + }, + { + "epoch": 3.0, + "eval_bleu": 59.1835, + "eval_gen_len": 15.7226, + "eval_loss": 0.6417234539985657, + "eval_runtime": 6645.2011, + "eval_samples_per_second": 12.929, + "eval_steps_per_second": 1.616, + "step": 129072 + }, + { + "epoch": 3.01, + "learning_rate": 4.0126077217434597e-05, + "loss": 0.4217, + "step": 129500 + }, + { + "epoch": 3.02, + "learning_rate": 4.008718850138444e-05, + "loss": 0.3942, + "step": 130000 + }, + { + "epoch": 3.03, + "learning_rate": 4.0048299785334286e-05, + "loss": 0.3948, + "step": 130500 + }, + { + "epoch": 3.04, + "learning_rate": 4.0009488846716236e-05, + "loss": 0.3895, + "step": 131000 + }, + { + "epoch": 3.06, + "learning_rate": 3.9970677908098185e-05, + "loss": 0.3912, + "step": 131500 + }, + { + "epoch": 3.07, + "learning_rate": 3.993178919204804e-05, + "loss": 0.3989, + "step": 132000 + }, + { + "epoch": 3.08, + "learning_rate": 3.989290047599789e-05, + "loss": 0.3975, + "step": 132500 + }, + { + "epoch": 3.09, + "learning_rate": 3.985401175994773e-05, + "loss": 0.4036, + "step": 133000 + }, + { + "epoch": 3.1, + "learning_rate": 3.9815123043897585e-05, + "loss": 0.4028, + "step": 133500 + }, + { + "epoch": 3.11, + "learning_rate": 3.9776234327847436e-05, + "loss": 0.3985, + "step": 134000 + }, + { + "epoch": 3.13, + "learning_rate": 3.973734561179728e-05, + "loss": 0.4002, + "step": 134500 + }, + { + "epoch": 3.14, + "learning_rate": 3.969845689574713e-05, + "loss": 0.3975, + "step": 135000 + }, + { + "epoch": 3.15, + "learning_rate": 3.9659568179696984e-05, + "loss": 0.4024, + "step": 135500 + }, + { + "epoch": 3.16, + "learning_rate": 3.962067946364683e-05, + "loss": 0.4016, + "step": 136000 + }, + { + "epoch": 3.17, + "learning_rate": 3.958179074759668e-05, + "loss": 0.4084, + "step": 136500 + }, + { + "epoch": 3.18, + "learning_rate": 3.954290203154653e-05, + "loss": 0.4054, + "step": 137000 + }, + { + "epoch": 3.2, + "learning_rate": 3.9504013315496377e-05, + "loss": 0.4061, + "step": 137500 + }, + { + "epoch": 3.21, + "learning_rate": 3.946512459944623e-05, + "loss": 0.4098, + "step": 138000 + }, + { + "epoch": 3.22, + "learning_rate": 3.942623588339607e-05, + "loss": 0.4115, + "step": 138500 + }, + { + "epoch": 3.23, + "learning_rate": 3.9387347167345924e-05, + "loss": 0.4068, + "step": 139000 + }, + { + "epoch": 3.24, + "learning_rate": 3.9348536228727874e-05, + "loss": 0.4058, + "step": 139500 + }, + { + "epoch": 3.25, + "learning_rate": 3.930964751267772e-05, + "loss": 0.4059, + "step": 140000 + }, + { + "epoch": 3.27, + "learning_rate": 3.927075879662757e-05, + "loss": 0.4145, + "step": 140500 + }, + { + "epoch": 3.28, + "learning_rate": 3.9232025635441625e-05, + "loss": 0.4104, + "step": 141000 + }, + { + "epoch": 3.29, + "learning_rate": 3.919313691939147e-05, + "loss": 0.4141, + "step": 141500 + }, + { + "epoch": 3.3, + "learning_rate": 3.915424820334132e-05, + "loss": 0.4158, + "step": 142000 + }, + { + "epoch": 3.31, + "learning_rate": 3.911535948729117e-05, + "loss": 0.4115, + "step": 142500 + }, + { + "epoch": 3.32, + "learning_rate": 3.907647077124102e-05, + "loss": 0.4197, + "step": 143000 + }, + { + "epoch": 3.34, + "learning_rate": 3.903758205519087e-05, + "loss": 0.4082, + "step": 143500 + }, + { + "epoch": 3.35, + "learning_rate": 3.899869333914072e-05, + "loss": 0.4231, + "step": 144000 + }, + { + "epoch": 3.36, + "learning_rate": 3.8959804623090565e-05, + "loss": 0.4237, + "step": 144500 + }, + { + "epoch": 3.37, + "learning_rate": 3.892091590704042e-05, + "loss": 0.4162, + "step": 145000 + }, + { + "epoch": 3.38, + "learning_rate": 3.888202719099027e-05, + "loss": 0.4189, + "step": 145500 + }, + { + "epoch": 3.39, + "learning_rate": 3.884321625237221e-05, + "loss": 0.4154, + "step": 146000 + }, + { + "epoch": 3.41, + "learning_rate": 3.880432753632206e-05, + "loss": 0.4193, + "step": 146500 + }, + { + "epoch": 3.42, + "learning_rate": 3.876543882027191e-05, + "loss": 0.4238, + "step": 147000 + }, + { + "epoch": 3.43, + "learning_rate": 3.872655010422176e-05, + "loss": 0.4274, + "step": 147500 + }, + { + "epoch": 3.44, + "learning_rate": 3.868766138817161e-05, + "loss": 0.4165, + "step": 148000 + }, + { + "epoch": 3.45, + "learning_rate": 3.8648772672121455e-05, + "loss": 0.4223, + "step": 148500 + }, + { + "epoch": 3.46, + "learning_rate": 3.860988395607131e-05, + "loss": 0.4213, + "step": 149000 + }, + { + "epoch": 3.47, + "learning_rate": 3.8571073017453256e-05, + "loss": 0.422, + "step": 149500 + }, + { + "epoch": 3.49, + "learning_rate": 3.853218430140311e-05, + "loss": 0.4245, + "step": 150000 + }, + { + "epoch": 3.5, + "learning_rate": 3.849329558535296e-05, + "loss": 0.4247, + "step": 150500 + }, + { + "epoch": 3.51, + "learning_rate": 3.8454406869302804e-05, + "loss": 0.4267, + "step": 151000 + }, + { + "epoch": 3.52, + "learning_rate": 3.8415518153252656e-05, + "loss": 0.4253, + "step": 151500 + }, + { + "epoch": 3.53, + "learning_rate": 3.837662943720251e-05, + "loss": 0.4311, + "step": 152000 + }, + { + "epoch": 3.54, + "learning_rate": 3.833774072115235e-05, + "loss": 0.4204, + "step": 152500 + }, + { + "epoch": 3.56, + "learning_rate": 3.8298852005102203e-05, + "loss": 0.4246, + "step": 153000 + }, + { + "epoch": 3.57, + "learning_rate": 3.8259963289052055e-05, + "loss": 0.4424, + "step": 153500 + }, + { + "epoch": 3.58, + "learning_rate": 3.82210745730019e-05, + "loss": 0.426, + "step": 154000 + }, + { + "epoch": 3.59, + "learning_rate": 3.8182185856951744e-05, + "loss": 0.4304, + "step": 154500 + }, + { + "epoch": 3.6, + "learning_rate": 3.8143374918333694e-05, + "loss": 0.4292, + "step": 155000 + }, + { + "epoch": 3.61, + "learning_rate": 3.8104486202283546e-05, + "loss": 0.426, + "step": 155500 + }, + { + "epoch": 3.63, + "learning_rate": 3.80655974862334e-05, + "loss": 0.4247, + "step": 156000 + }, + { + "epoch": 3.64, + "learning_rate": 3.802670877018324e-05, + "loss": 0.4271, + "step": 156500 + }, + { + "epoch": 3.65, + "learning_rate": 3.798789783156519e-05, + "loss": 0.4279, + "step": 157000 + }, + { + "epoch": 3.66, + "learning_rate": 3.794900911551504e-05, + "loss": 0.4357, + "step": 157500 + }, + { + "epoch": 3.67, + "learning_rate": 3.791019817689699e-05, + "loss": 0.435, + "step": 158000 + }, + { + "epoch": 3.68, + "learning_rate": 3.7871309460846844e-05, + "loss": 0.4336, + "step": 158500 + }, + { + "epoch": 3.7, + "learning_rate": 3.7832420744796696e-05, + "loss": 0.4303, + "step": 159000 + }, + { + "epoch": 3.71, + "learning_rate": 3.779353202874654e-05, + "loss": 0.4231, + "step": 159500 + }, + { + "epoch": 3.72, + "learning_rate": 3.775464331269639e-05, + "loss": 0.4303, + "step": 160000 + }, + { + "epoch": 3.73, + "learning_rate": 3.7715754596646244e-05, + "loss": 0.4301, + "step": 160500 + }, + { + "epoch": 3.74, + "learning_rate": 3.767686588059609e-05, + "loss": 0.4317, + "step": 161000 + }, + { + "epoch": 3.75, + "learning_rate": 3.763797716454593e-05, + "loss": 0.4421, + "step": 161500 + }, + { + "epoch": 3.77, + "learning_rate": 3.7599088448495785e-05, + "loss": 0.4338, + "step": 162000 + }, + { + "epoch": 3.78, + "learning_rate": 3.7560199732445636e-05, + "loss": 0.4315, + "step": 162500 + }, + { + "epoch": 3.79, + "learning_rate": 3.752131101639548e-05, + "loss": 0.4318, + "step": 163000 + }, + { + "epoch": 3.8, + "learning_rate": 3.748242230034533e-05, + "loss": 0.4378, + "step": 163500 + }, + { + "epoch": 3.81, + "learning_rate": 3.7443533584295184e-05, + "loss": 0.438, + "step": 164000 + }, + { + "epoch": 3.82, + "learning_rate": 3.740464486824503e-05, + "loss": 0.4385, + "step": 164500 + }, + { + "epoch": 3.84, + "learning_rate": 3.736575615219488e-05, + "loss": 0.4365, + "step": 165000 + }, + { + "epoch": 3.85, + "learning_rate": 3.732686743614473e-05, + "loss": 0.4286, + "step": 165500 + }, + { + "epoch": 3.86, + "learning_rate": 3.7287978720094576e-05, + "loss": 0.4369, + "step": 166000 + }, + { + "epoch": 3.87, + "learning_rate": 3.7249167781476526e-05, + "loss": 0.4361, + "step": 166500 + }, + { + "epoch": 3.88, + "learning_rate": 3.721027906542638e-05, + "loss": 0.4371, + "step": 167000 + }, + { + "epoch": 3.89, + "learning_rate": 3.717139034937623e-05, + "loss": 0.4351, + "step": 167500 + }, + { + "epoch": 3.9, + "learning_rate": 3.713257941075818e-05, + "loss": 0.4361, + "step": 168000 + }, + { + "epoch": 3.92, + "learning_rate": 3.709376847214013e-05, + "loss": 0.434, + "step": 168500 + }, + { + "epoch": 3.93, + "learning_rate": 3.705487975608998e-05, + "loss": 0.4408, + "step": 169000 + }, + { + "epoch": 3.94, + "learning_rate": 3.701606881747192e-05, + "loss": 0.4398, + "step": 169500 + }, + { + "epoch": 3.95, + "learning_rate": 3.6977180101421774e-05, + "loss": 0.4396, + "step": 170000 + }, + { + "epoch": 3.96, + "learning_rate": 3.693829138537162e-05, + "loss": 0.438, + "step": 170500 + }, + { + "epoch": 3.97, + "learning_rate": 3.689940266932147e-05, + "loss": 0.4374, + "step": 171000 + }, + { + "epoch": 3.99, + "learning_rate": 3.686051395327132e-05, + "loss": 0.4407, + "step": 171500 + }, + { + "epoch": 4.0, + "learning_rate": 3.682162523722117e-05, + "loss": 0.434, + "step": 172000 + }, + { + "epoch": 4.0, + "eval_bleu": 59.6194, + "eval_gen_len": 15.702, + "eval_loss": 0.6589247584342957, + "eval_runtime": 6589.9687, + "eval_samples_per_second": 13.037, + "eval_steps_per_second": 1.63, + "step": 172096 + }, + { + "epoch": 4.01, + "learning_rate": 3.678273652117102e-05, + "loss": 0.3325, + "step": 172500 + }, + { + "epoch": 4.02, + "learning_rate": 3.674384780512087e-05, + "loss": 0.3007, + "step": 173000 + }, + { + "epoch": 4.03, + "learning_rate": 3.6704959089070715e-05, + "loss": 0.305, + "step": 173500 + }, + { + "epoch": 4.04, + "learning_rate": 3.6666070373020566e-05, + "loss": 0.3016, + "step": 174000 + }, + { + "epoch": 4.06, + "learning_rate": 3.662718165697042e-05, + "loss": 0.2961, + "step": 174500 + }, + { + "epoch": 4.07, + "learning_rate": 3.6588526273216564e-05, + "loss": 0.3007, + "step": 175000 + }, + { + "epoch": 4.08, + "learning_rate": 3.6549637557166415e-05, + "loss": 0.3012, + "step": 175500 + }, + { + "epoch": 4.09, + "learning_rate": 3.651074884111627e-05, + "loss": 0.3094, + "step": 176000 + }, + { + "epoch": 4.1, + "learning_rate": 3.647186012506612e-05, + "loss": 0.3069, + "step": 176500 + }, + { + "epoch": 4.11, + "learning_rate": 3.6432971409015956e-05, + "loss": 0.3091, + "step": 177000 + }, + { + "epoch": 4.13, + "learning_rate": 3.639408269296581e-05, + "loss": 0.3147, + "step": 177500 + }, + { + "epoch": 4.14, + "learning_rate": 3.635519397691566e-05, + "loss": 0.3112, + "step": 178000 + }, + { + "epoch": 4.15, + "learning_rate": 3.6316305260865504e-05, + "loss": 0.3129, + "step": 178500 + }, + { + "epoch": 4.16, + "learning_rate": 3.6277416544815356e-05, + "loss": 0.309, + "step": 179000 + }, + { + "epoch": 4.17, + "learning_rate": 3.623852782876521e-05, + "loss": 0.3195, + "step": 179500 + }, + { + "epoch": 4.18, + "learning_rate": 3.619963911271506e-05, + "loss": 0.3219, + "step": 180000 + }, + { + "epoch": 4.2, + "learning_rate": 3.6160750396664903e-05, + "loss": 0.3134, + "step": 180500 + }, + { + "epoch": 4.21, + "learning_rate": 3.6121861680614755e-05, + "loss": 0.3182, + "step": 181000 + }, + { + "epoch": 4.22, + "learning_rate": 3.6083050741996705e-05, + "loss": 0.3131, + "step": 181500 + }, + { + "epoch": 4.23, + "learning_rate": 3.6044162025946556e-05, + "loss": 0.3193, + "step": 182000 + }, + { + "epoch": 4.24, + "learning_rate": 3.6005351087328506e-05, + "loss": 0.3153, + "step": 182500 + }, + { + "epoch": 4.25, + "learning_rate": 3.596646237127835e-05, + "loss": 0.3264, + "step": 183000 + }, + { + "epoch": 4.27, + "learning_rate": 3.59276514326603e-05, + "loss": 0.3195, + "step": 183500 + }, + { + "epoch": 4.28, + "learning_rate": 3.588876271661015e-05, + "loss": 0.3197, + "step": 184000 + }, + { + "epoch": 4.29, + "learning_rate": 3.5849874000559996e-05, + "loss": 0.3235, + "step": 184500 + }, + { + "epoch": 4.3, + "learning_rate": 3.581098528450985e-05, + "loss": 0.3188, + "step": 185000 + }, + { + "epoch": 4.31, + "learning_rate": 3.577209656845969e-05, + "loss": 0.3183, + "step": 185500 + }, + { + "epoch": 4.32, + "learning_rate": 3.5733207852409544e-05, + "loss": 0.3271, + "step": 186000 + }, + { + "epoch": 4.33, + "learning_rate": 3.5694319136359396e-05, + "loss": 0.3333, + "step": 186500 + }, + { + "epoch": 4.35, + "learning_rate": 3.565543042030924e-05, + "loss": 0.3216, + "step": 187000 + }, + { + "epoch": 4.36, + "learning_rate": 3.561654170425909e-05, + "loss": 0.3296, + "step": 187500 + }, + { + "epoch": 4.37, + "learning_rate": 3.5577652988208944e-05, + "loss": 0.3282, + "step": 188000 + }, + { + "epoch": 4.38, + "learning_rate": 3.5538764272158795e-05, + "loss": 0.3297, + "step": 188500 + }, + { + "epoch": 4.39, + "learning_rate": 3.549987555610864e-05, + "loss": 0.3283, + "step": 189000 + }, + { + "epoch": 4.4, + "learning_rate": 3.546106461749059e-05, + "loss": 0.3294, + "step": 189500 + }, + { + "epoch": 4.42, + "learning_rate": 3.542217590144044e-05, + "loss": 0.3275, + "step": 190000 + }, + { + "epoch": 4.43, + "learning_rate": 3.538328718539029e-05, + "loss": 0.324, + "step": 190500 + }, + { + "epoch": 4.44, + "learning_rate": 3.534439846934014e-05, + "loss": 0.3285, + "step": 191000 + }, + { + "epoch": 4.45, + "learning_rate": 3.530558753072209e-05, + "loss": 0.3338, + "step": 191500 + }, + { + "epoch": 4.46, + "learning_rate": 3.526669881467194e-05, + "loss": 0.3347, + "step": 192000 + }, + { + "epoch": 4.47, + "learning_rate": 3.522781009862178e-05, + "loss": 0.338, + "step": 192500 + }, + { + "epoch": 4.49, + "learning_rate": 3.5188921382571635e-05, + "loss": 0.3312, + "step": 193000 + }, + { + "epoch": 4.5, + "learning_rate": 3.515003266652148e-05, + "loss": 0.3364, + "step": 193500 + }, + { + "epoch": 4.51, + "learning_rate": 3.511114395047133e-05, + "loss": 0.3347, + "step": 194000 + }, + { + "epoch": 4.52, + "learning_rate": 3.507225523442118e-05, + "loss": 0.3334, + "step": 194500 + }, + { + "epoch": 4.53, + "learning_rate": 3.503336651837103e-05, + "loss": 0.3319, + "step": 195000 + }, + { + "epoch": 4.54, + "learning_rate": 3.499455557975298e-05, + "loss": 0.3332, + "step": 195500 + }, + { + "epoch": 4.56, + "learning_rate": 3.495566686370283e-05, + "loss": 0.3418, + "step": 196000 + }, + { + "epoch": 4.57, + "learning_rate": 3.491677814765268e-05, + "loss": 0.3402, + "step": 196500 + }, + { + "epoch": 4.58, + "learning_rate": 3.487796720903463e-05, + "loss": 0.3342, + "step": 197000 + }, + { + "epoch": 4.59, + "learning_rate": 3.483907849298448e-05, + "loss": 0.3294, + "step": 197500 + }, + { + "epoch": 4.6, + "learning_rate": 3.4800189776934326e-05, + "loss": 0.3393, + "step": 198000 + }, + { + "epoch": 4.61, + "learning_rate": 3.476130106088418e-05, + "loss": 0.3446, + "step": 198500 + }, + { + "epoch": 4.63, + "learning_rate": 3.472241234483403e-05, + "loss": 0.3375, + "step": 199000 + }, + { + "epoch": 4.64, + "learning_rate": 3.4683523628783874e-05, + "loss": 0.3399, + "step": 199500 + }, + { + "epoch": 4.65, + "learning_rate": 3.4644634912733725e-05, + "loss": 0.3405, + "step": 200000 + }, + { + "epoch": 4.66, + "learning_rate": 3.460574619668357e-05, + "loss": 0.3378, + "step": 200500 + }, + { + "epoch": 4.67, + "learning_rate": 3.456693525806552e-05, + "loss": 0.346, + "step": 201000 + }, + { + "epoch": 4.68, + "learning_rate": 3.452804654201537e-05, + "loss": 0.3455, + "step": 201500 + }, + { + "epoch": 4.7, + "learning_rate": 3.4489157825965216e-05, + "loss": 0.3432, + "step": 202000 + }, + { + "epoch": 4.71, + "learning_rate": 3.4450346887347166e-05, + "loss": 0.3505, + "step": 202500 + }, + { + "epoch": 4.72, + "learning_rate": 3.441145817129702e-05, + "loss": 0.3432, + "step": 203000 + }, + { + "epoch": 4.73, + "learning_rate": 3.437256945524687e-05, + "loss": 0.3405, + "step": 203500 + }, + { + "epoch": 4.74, + "learning_rate": 3.433368073919671e-05, + "loss": 0.3394, + "step": 204000 + }, + { + "epoch": 4.75, + "learning_rate": 3.4294792023146565e-05, + "loss": 0.3477, + "step": 204500 + }, + { + "epoch": 4.76, + "learning_rate": 3.4255903307096416e-05, + "loss": 0.352, + "step": 205000 + }, + { + "epoch": 4.78, + "learning_rate": 3.421701459104627e-05, + "loss": 0.3442, + "step": 205500 + }, + { + "epoch": 4.79, + "learning_rate": 3.417812587499611e-05, + "loss": 0.3487, + "step": 206000 + }, + { + "epoch": 4.8, + "learning_rate": 3.4139237158945964e-05, + "loss": 0.346, + "step": 206500 + }, + { + "epoch": 4.81, + "learning_rate": 3.4100348442895816e-05, + "loss": 0.3533, + "step": 207000 + }, + { + "epoch": 4.82, + "learning_rate": 3.406145972684566e-05, + "loss": 0.3501, + "step": 207500 + }, + { + "epoch": 4.83, + "learning_rate": 3.4022571010795505e-05, + "loss": 0.3472, + "step": 208000 + }, + { + "epoch": 4.85, + "learning_rate": 3.398368229474536e-05, + "loss": 0.351, + "step": 208500 + }, + { + "epoch": 4.86, + "learning_rate": 3.3944871356127306e-05, + "loss": 0.3415, + "step": 209000 + }, + { + "epoch": 4.87, + "learning_rate": 3.390598264007716e-05, + "loss": 0.3478, + "step": 209500 + }, + { + "epoch": 4.88, + "learning_rate": 3.386717170145911e-05, + "loss": 0.3475, + "step": 210000 + }, + { + "epoch": 4.89, + "learning_rate": 3.382828298540895e-05, + "loss": 0.3486, + "step": 210500 + }, + { + "epoch": 4.9, + "learning_rate": 3.3789394269358804e-05, + "loss": 0.3564, + "step": 211000 + }, + { + "epoch": 4.92, + "learning_rate": 3.3750505553308655e-05, + "loss": 0.3484, + "step": 211500 + }, + { + "epoch": 4.93, + "learning_rate": 3.37116168372585e-05, + "loss": 0.3541, + "step": 212000 + }, + { + "epoch": 4.94, + "learning_rate": 3.367280589864045e-05, + "loss": 0.3464, + "step": 212500 + }, + { + "epoch": 4.95, + "learning_rate": 3.36339949600224e-05, + "loss": 0.35, + "step": 213000 + }, + { + "epoch": 4.96, + "learning_rate": 3.359510624397225e-05, + "loss": 0.3475, + "step": 213500 + }, + { + "epoch": 4.97, + "learning_rate": 3.35562175279221e-05, + "loss": 0.3479, + "step": 214000 + }, + { + "epoch": 4.99, + "learning_rate": 3.3517328811871954e-05, + "loss": 0.3556, + "step": 214500 + }, + { + "epoch": 5.0, + "learning_rate": 3.34784400958218e-05, + "loss": 0.3504, + "step": 215000 + }, + { + "epoch": 5.0, + "eval_bleu": 59.352, + "eval_gen_len": 15.7454, + "eval_loss": 0.7117229700088501, + "eval_runtime": 6592.0433, + "eval_samples_per_second": 13.033, + "eval_steps_per_second": 1.629, + "step": 215120 + }, + { + "epoch": 5.01, + "learning_rate": 3.343955137977165e-05, + "loss": 0.2566, + "step": 215500 + }, + { + "epoch": 5.02, + "learning_rate": 3.3400662663721495e-05, + "loss": 0.2267, + "step": 216000 + }, + { + "epoch": 5.03, + "learning_rate": 3.336177394767134e-05, + "loss": 0.2283, + "step": 216500 + }, + { + "epoch": 5.04, + "learning_rate": 3.332288523162119e-05, + "loss": 0.226, + "step": 217000 + }, + { + "epoch": 5.06, + "learning_rate": 3.328399651557104e-05, + "loss": 0.2285, + "step": 217500 + }, + { + "epoch": 5.07, + "learning_rate": 3.3245107799520894e-05, + "loss": 0.2307, + "step": 218000 + }, + { + "epoch": 5.08, + "learning_rate": 3.320621908347074e-05, + "loss": 0.2282, + "step": 218500 + }, + { + "epoch": 5.09, + "learning_rate": 3.316733036742059e-05, + "loss": 0.2302, + "step": 219000 + }, + { + "epoch": 5.1, + "learning_rate": 3.312851942880254e-05, + "loss": 0.2312, + "step": 219500 + }, + { + "epoch": 5.11, + "learning_rate": 3.308970849018449e-05, + "loss": 0.2318, + "step": 220000 + }, + { + "epoch": 5.13, + "learning_rate": 3.305089755156644e-05, + "loss": 0.2389, + "step": 220500 + }, + { + "epoch": 5.14, + "learning_rate": 3.301200883551629e-05, + "loss": 0.2371, + "step": 221000 + }, + { + "epoch": 5.15, + "learning_rate": 3.2973120119466136e-05, + "loss": 0.2396, + "step": 221500 + }, + { + "epoch": 5.16, + "learning_rate": 3.293423140341599e-05, + "loss": 0.2356, + "step": 222000 + }, + { + "epoch": 5.17, + "learning_rate": 3.289534268736584e-05, + "loss": 0.236, + "step": 222500 + }, + { + "epoch": 5.18, + "learning_rate": 3.285645397131569e-05, + "loss": 0.2381, + "step": 223000 + }, + { + "epoch": 5.19, + "learning_rate": 3.281756525526553e-05, + "loss": 0.2396, + "step": 223500 + }, + { + "epoch": 5.21, + "learning_rate": 3.277875431664748e-05, + "loss": 0.2361, + "step": 224000 + }, + { + "epoch": 5.22, + "learning_rate": 3.273986560059733e-05, + "loss": 0.2423, + "step": 224500 + }, + { + "epoch": 5.23, + "learning_rate": 3.270097688454718e-05, + "loss": 0.2407, + "step": 225000 + }, + { + "epoch": 5.24, + "learning_rate": 3.266216594592913e-05, + "loss": 0.2427, + "step": 225500 + }, + { + "epoch": 5.25, + "learning_rate": 3.262327722987898e-05, + "loss": 0.2454, + "step": 226000 + }, + { + "epoch": 5.26, + "learning_rate": 3.258438851382883e-05, + "loss": 0.2484, + "step": 226500 + }, + { + "epoch": 5.28, + "learning_rate": 3.254549979777868e-05, + "loss": 0.2467, + "step": 227000 + }, + { + "epoch": 5.29, + "learning_rate": 3.250661108172853e-05, + "loss": 0.2439, + "step": 227500 + }, + { + "epoch": 5.3, + "learning_rate": 3.2467722365678375e-05, + "loss": 0.2496, + "step": 228000 + }, + { + "epoch": 5.31, + "learning_rate": 3.2428833649628226e-05, + "loss": 0.2486, + "step": 228500 + }, + { + "epoch": 5.32, + "learning_rate": 3.238994493357808e-05, + "loss": 0.2466, + "step": 229000 + }, + { + "epoch": 5.33, + "learning_rate": 3.235105621752792e-05, + "loss": 0.2497, + "step": 229500 + }, + { + "epoch": 5.35, + "learning_rate": 3.2312167501477774e-05, + "loss": 0.2455, + "step": 230000 + }, + { + "epoch": 5.36, + "learning_rate": 3.2273278785427626e-05, + "loss": 0.2532, + "step": 230500 + }, + { + "epoch": 5.37, + "learning_rate": 3.223439006937747e-05, + "loss": 0.2454, + "step": 231000 + }, + { + "epoch": 5.38, + "learning_rate": 3.219557913075942e-05, + "loss": 0.2491, + "step": 231500 + }, + { + "epoch": 5.39, + "learning_rate": 3.2156690414709265e-05, + "loss": 0.2501, + "step": 232000 + }, + { + "epoch": 5.4, + "learning_rate": 3.2117801698659116e-05, + "loss": 0.2528, + "step": 232500 + }, + { + "epoch": 5.42, + "learning_rate": 3.207891298260897e-05, + "loss": 0.2482, + "step": 233000 + }, + { + "epoch": 5.43, + "learning_rate": 3.204002426655881e-05, + "loss": 0.2548, + "step": 233500 + }, + { + "epoch": 5.44, + "learning_rate": 3.200121332794076e-05, + "loss": 0.2565, + "step": 234000 + }, + { + "epoch": 5.45, + "learning_rate": 3.1962324611890614e-05, + "loss": 0.253, + "step": 234500 + }, + { + "epoch": 5.46, + "learning_rate": 3.1923513673272564e-05, + "loss": 0.251, + "step": 235000 + }, + { + "epoch": 5.47, + "learning_rate": 3.1884624957222415e-05, + "loss": 0.2542, + "step": 235500 + }, + { + "epoch": 5.49, + "learning_rate": 3.1845736241172267e-05, + "loss": 0.2611, + "step": 236000 + }, + { + "epoch": 5.5, + "learning_rate": 3.180684752512211e-05, + "loss": 0.2583, + "step": 236500 + }, + { + "epoch": 5.51, + "learning_rate": 3.176795880907196e-05, + "loss": 0.2577, + "step": 237000 + }, + { + "epoch": 5.52, + "learning_rate": 3.1729070093021814e-05, + "loss": 0.2555, + "step": 237500 + }, + { + "epoch": 5.53, + "learning_rate": 3.169018137697166e-05, + "loss": 0.2581, + "step": 238000 + }, + { + "epoch": 5.54, + "learning_rate": 3.165129266092151e-05, + "loss": 0.2525, + "step": 238500 + }, + { + "epoch": 5.56, + "learning_rate": 3.1612403944871355e-05, + "loss": 0.2607, + "step": 239000 + }, + { + "epoch": 5.57, + "learning_rate": 3.157351522882121e-05, + "loss": 0.2623, + "step": 239500 + }, + { + "epoch": 5.58, + "learning_rate": 3.153462651277105e-05, + "loss": 0.2644, + "step": 240000 + }, + { + "epoch": 5.59, + "learning_rate": 3.14957377967209e-05, + "loss": 0.2635, + "step": 240500 + }, + { + "epoch": 5.6, + "learning_rate": 3.1456849080670755e-05, + "loss": 0.2644, + "step": 241000 + }, + { + "epoch": 5.61, + "learning_rate": 3.14179603646206e-05, + "loss": 0.2586, + "step": 241500 + }, + { + "epoch": 5.62, + "learning_rate": 3.137907164857045e-05, + "loss": 0.2688, + "step": 242000 + }, + { + "epoch": 5.64, + "learning_rate": 3.13401829325203e-05, + "loss": 0.2592, + "step": 242500 + }, + { + "epoch": 5.65, + "learning_rate": 3.1301294216470154e-05, + "loss": 0.2627, + "step": 243000 + }, + { + "epoch": 5.66, + "learning_rate": 3.12626388327163e-05, + "loss": 0.2659, + "step": 243500 + }, + { + "epoch": 5.67, + "learning_rate": 3.122375011666615e-05, + "loss": 0.2631, + "step": 244000 + }, + { + "epoch": 5.68, + "learning_rate": 3.1184861400616e-05, + "loss": 0.2647, + "step": 244500 + }, + { + "epoch": 5.69, + "learning_rate": 3.114597268456585e-05, + "loss": 0.2662, + "step": 245000 + }, + { + "epoch": 5.71, + "learning_rate": 3.11070839685157e-05, + "loss": 0.2647, + "step": 245500 + }, + { + "epoch": 5.72, + "learning_rate": 3.106819525246555e-05, + "loss": 0.2735, + "step": 246000 + }, + { + "epoch": 5.73, + "learning_rate": 3.10293843138475e-05, + "loss": 0.27, + "step": 246500 + }, + { + "epoch": 5.74, + "learning_rate": 3.0990495597797345e-05, + "loss": 0.2696, + "step": 247000 + }, + { + "epoch": 5.75, + "learning_rate": 3.095160688174719e-05, + "loss": 0.2652, + "step": 247500 + }, + { + "epoch": 5.76, + "learning_rate": 3.091279594312914e-05, + "loss": 0.2704, + "step": 248000 + }, + { + "epoch": 5.78, + "learning_rate": 3.087390722707899e-05, + "loss": 0.2633, + "step": 248500 + }, + { + "epoch": 5.79, + "learning_rate": 3.083501851102884e-05, + "loss": 0.2696, + "step": 249000 + }, + { + "epoch": 5.8, + "learning_rate": 3.079612979497869e-05, + "loss": 0.2657, + "step": 249500 + }, + { + "epoch": 5.81, + "learning_rate": 3.075724107892854e-05, + "loss": 0.2705, + "step": 250000 + }, + { + "epoch": 5.82, + "learning_rate": 3.071835236287839e-05, + "loss": 0.2727, + "step": 250500 + }, + { + "epoch": 5.83, + "learning_rate": 3.0679463646828235e-05, + "loss": 0.2636, + "step": 251000 + }, + { + "epoch": 5.85, + "learning_rate": 3.064057493077809e-05, + "loss": 0.2772, + "step": 251500 + }, + { + "epoch": 5.86, + "learning_rate": 3.060168621472794e-05, + "loss": 0.2658, + "step": 252000 + }, + { + "epoch": 5.87, + "learning_rate": 3.056279749867779e-05, + "loss": 0.2744, + "step": 252500 + }, + { + "epoch": 5.88, + "learning_rate": 3.0523908782627634e-05, + "loss": 0.2711, + "step": 253000 + }, + { + "epoch": 5.89, + "learning_rate": 3.0485020066577486e-05, + "loss": 0.2668, + "step": 253500 + }, + { + "epoch": 5.9, + "learning_rate": 3.0446131350527334e-05, + "loss": 0.2706, + "step": 254000 + }, + { + "epoch": 5.92, + "learning_rate": 3.040724263447718e-05, + "loss": 0.2698, + "step": 254500 + }, + { + "epoch": 5.93, + "learning_rate": 3.0368353918427027e-05, + "loss": 0.2714, + "step": 255000 + }, + { + "epoch": 5.94, + "learning_rate": 3.0329542979808977e-05, + "loss": 0.2734, + "step": 255500 + }, + { + "epoch": 5.95, + "learning_rate": 3.0290654263758828e-05, + "loss": 0.2696, + "step": 256000 + }, + { + "epoch": 5.96, + "learning_rate": 3.0251765547708676e-05, + "loss": 0.2752, + "step": 256500 + }, + { + "epoch": 5.97, + "learning_rate": 3.0212876831658528e-05, + "loss": 0.2732, + "step": 257000 + }, + { + "epoch": 5.99, + "learning_rate": 3.0173988115608376e-05, + "loss": 0.2778, + "step": 257500 + }, + { + "epoch": 6.0, + "learning_rate": 3.0135177176990326e-05, + "loss": 0.2799, + "step": 258000 + }, + { + "epoch": 6.0, + "eval_bleu": 59.2034, + "eval_gen_len": 15.6702, + "eval_loss": 0.7783872485160828, + "eval_runtime": 6542.9489, + "eval_samples_per_second": 13.131, + "eval_steps_per_second": 1.641, + "step": 258144 + }, + { + "epoch": 6.0, + "step": 258144, + "total_flos": 8.950904749025133e+18, + "train_loss": 0.5174947596547979, + "train_runtime": 240022.8696, + "train_samples_per_second": 43.02, + "train_steps_per_second": 2.689 + } + ], + "logging_steps": 500, + "max_steps": 645360, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "total_flos": 8.950904749025133e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c71a9dade62e1afff1c0282f02a162933d87afd7 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d80f6465473a02dd91018cea5b0675845684097cfa8269a7e746c79607dbe1 +size 4856