diff --git a/README.md b/README.md
index 7b95401dc46245ac339fc25059d4a56d90b4cde5..0cb113cb3ce645fcca40180da47ba9b7bc8e5f20 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,156 @@
----
-license: apache-2.0
----
+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model-index:
+- name: outputs/lora-out
+  results: []
+---
+
+
+
+[Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl)
+
+See axolotl config
+
+axolotl version: `0.4.0`
+```yaml
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: burkelibbey/colors
+    type:
+      field_instruction: color
+      field_output: description
+      conversation: chatml
+chat_template: chatml
+
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/lora-out
+
+sequence_len: 4096
+sample_packing: true
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+
+```
+
+
+
+# outputs/lora-out
+
+This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the [burkelibbey/colors](https://huggingface.co/datasets/burkelibbey/colors) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.2375
+
+## Model description
+
+This is a LoRA adapter (r=32, alpha=16, dropout 0.05, applied to all linear projection layers) trained with axolotl 0.4.0 on top of TinyLlama-1.1B. The base model was loaded in 8-bit for training, prompts were formatted with the ChatML chat template, and examples were sample-packed into 4096-token sequences (see the axolotl config above).
+
+## Intended uses & limitations
+
+The adapter is trained to turn a color value into a short natural-language description, so it is mainly useful as a small demonstration of LoRA fine-tuning with axolotl rather than as a general-purpose assistant. It inherits the capabilities, biases, and limitations of the TinyLlama-1.1B base model and has not been evaluated beyond the validation loss reported in this card.
+
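+A minimal inference sketch (not an official recipe): the adapter path below is a placeholder for wherever this adapter ends up (a local directory or a Hub repo id), and the example color value is made up.
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+adapter_id = "./outputs/lora-out"  # placeholder: local output dir or published adapter repo id
+
+# The fine-tuned tokenizer (which carries the ChatML chat template) is saved
+# alongside the adapter; the base tokenizer also works if it is missing.
+tokenizer = AutoTokenizer.from_pretrained(adapter_id)
+model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16, device_map="auto")
+model = PeftModel.from_pretrained(model, adapter_id)
+
+messages = [{"role": "user", "content": "#287fb8"}]  # hypothetical input color
+input_ids = tokenizer.apply_chat_template(
+    messages, add_generation_prompt=True, return_tensors="pt"
+).to(model.device)
+
+output = model.generate(input_ids, max_new_tokens=64)
+print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
+```
+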
+## Training and evaluation data
+
+Training used the [burkelibbey/colors](https://huggingface.co/datasets/burkelibbey/colors) dataset, with the `color` field as the instruction and the `description` field as the output, rendered as ChatML conversations. 5% of the examples were held out as the evaluation set (`val_set_size: 0.05`), and with `train_on_inputs: false` the loss is computed only on the assistant (description) side of each example.
+
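+The sketch below is illustrative only (it is not the axolotl implementation, and the example row is made up); mapping the instruction field to the user turn and the output field to the assistant turn is an assumption about how the `type:` block above is applied:
+
+```python
+# Hypothetical row from burkelibbey/colors.
+row = {"color": "#287fb8", "description": "a medium, slightly muted blue"}
+
+# Rendered with the ChatML format used for training (see the chat_template in tokenizer_config.json).
+chatml_text = (
+    f"<|im_start|>user\n{row['color']}<|im_end|>\n"
+    f"<|im_start|>assistant\n{row['description']}<|im_end|>\n"
+)
+print(chatml_text)
+```
+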
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: 8-bit AdamW (`adamw_bnb_8bit`) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 4
+
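+As a quick sanity check (a sketch; the single-GPU assumption follows from total_train_batch_size = 8), the reported numbers fit together as follows:
+
+```python
+# Effective batch size per optimizer step.
+micro_batch_size = 2                 # per-device batch size (train_batch_size above)
+gradient_accumulation_steps = 4
+print(micro_batch_size * gradient_accumulation_steps)  # 8 == total_train_batch_size
+
+# trainer_state.json reports max_steps = 196 over num_epochs = 4, i.e. 49 optimizer
+# steps per epoch; checkpoints were saved once per epoch (save_steps = 49), which is
+# where checkpoint-147 and checkpoint-196 come from.
+print(196 // 4)  # 49
+```
+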
+### Training results
+
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 2.7509 | 0.0204 | 1 | 2.6902 |
+| 1.8064 | 0.2653 | 13 | 1.6735 |
+| 1.5513 | 0.5306 | 26 | 1.4832 |
+| 1.482 | 0.7959 | 39 | 1.4111 |
+| 1.392 | 1.0408 | 52 | 1.3677 |
+| 1.3414 | 1.3061 | 65 | 1.3319 |
+| 1.3213 | 1.5714 | 78 | 1.3029 |
+| 1.3028 | 1.8367 | 91 | 1.2795 |
+| 1.2761 | 2.0816 | 104 | 1.2697 |
+| 1.2509 | 2.3469 | 117 | 1.2587 |
+| 1.2884 | 2.6122 | 130 | 1.2472 |
+| 1.254 | 2.8776 | 143 | 1.2410 |
+| 1.2523 | 3.1224 | 156 | 1.2403 |
+| 1.2468 | 3.3878 | 169 | 1.2385 |
+| 1.2476 | 3.6531 | 182 | 1.2370 |
+| 1.2366 | 3.9184 | 195 | 1.2375 |
+
+
+### Framework versions
+
+- PEFT 0.10.0
+- Transformers 4.40.2
+- Pytorch 2.1.2+cu118
+- Datasets 2.19.1
+- Tokenizers 0.19.1
\ No newline at end of file
diff --git a/adapter_config.json b/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2
--- /dev/null
+++ b/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/adapter_model.bin b/adapter_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6bd33a9c099c600f1f3e9c7f8d607fe50db66f7e
--- /dev/null
+++ b/adapter_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f20d120db2b5b6953281cb7fa6e550c36182e6da8f44b598738a5995d5be6f
+size 101036698
diff --git a/checkpoint-147/README.md b/checkpoint-147/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7
--- /dev/null
+++ b/checkpoint-147/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+This checkpoint contains only the LoRA adapter weights and tokenizer files, not the full model, so it has to be loaded on top of the base model.
+
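+A minimal loading sketch (the local path below is a placeholder; point it at wherever this checkpoint lives):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+checkpoint_dir = "./checkpoint-147"  # placeholder path to this checkpoint directory
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
+model = AutoModelForCausalLM.from_pretrained(base_id)
+model = PeftModel.from_pretrained(model, checkpoint_dir)
+```
+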
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.10.0
\ No newline at end of file
diff --git a/checkpoint-147/adapter_config.json b/checkpoint-147/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2
--- /dev/null
+++ b/checkpoint-147/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-147/adapter_model.safetensors b/checkpoint-147/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..152ab9385147f50b5fe7e616f8f39ac7695d27ac
--- /dev/null
+++ b/checkpoint-147/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb2ebdaf4b36ef443d056e4e52b5f0bf8223038232557b97bb7ce888df4d3c48
+size 100966336
diff --git a/checkpoint-147/optimizer.pt b/checkpoint-147/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7bb6a65695119caab2edb42b70a1b5714b780127
--- /dev/null
+++ b/checkpoint-147/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88cf550811bb96f9852bdb7a8952d49f6f0bf413e95b0759a8db28fcab406988
+size 50916644
diff --git a/checkpoint-147/rng_state.pth b/checkpoint-147/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b7ac6c67855d700e14ab4f2e12228c273b396659
--- /dev/null
+++ b/checkpoint-147/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4e8611d6bcf761201e741bdb2188a6ac976702d2e3f1a3ecc21fff90ea8a001
+size 14244
diff --git a/checkpoint-147/scheduler.pt b/checkpoint-147/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..097460d4c90beb69cee0fa370724efdc76d22114
--- /dev/null
+++ b/checkpoint-147/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83c1e2e1bea1da15cd4a47196fc191277510622d916f0b4b5e8c95f3258d5825
+size 1064
diff --git a/checkpoint-147/special_tokens_map.json b/checkpoint-147/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-147/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-147/tokenizer.model b/checkpoint-147/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-147/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-147/tokenizer_config.json b/checkpoint-147/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837
--- /dev/null
+++ b/checkpoint-147/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-147/trainer_state.json b/checkpoint-147/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..d0ffdde8040e0fba5f1aa6e5c1dc5908f2b58d9e
--- /dev/null
+++ b/checkpoint-147/trainer_state.json
@@ -0,0 +1,1146 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.9591836734693877,
+ "eval_steps": 13,
+ "global_step": 147,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02040816326530612,
+ "grad_norm": 0.7881951332092285,
+ "learning_rate": 2e-05,
+ "loss": 2.7509,
+ "step": 1
+ },
+ {
+ "epoch": 0.02040816326530612,
+ "eval_loss": 2.6902382373809814,
+ "eval_runtime": 269.5606,
+ "eval_samples_per_second": 6.288,
+ "eval_steps_per_second": 3.146,
+ "step": 1
+ },
+ {
+ "epoch": 0.04081632653061224,
+ "grad_norm": 0.789082407951355,
+ "learning_rate": 4e-05,
+ "loss": 2.7449,
+ "step": 2
+ },
+ {
+ "epoch": 0.061224489795918366,
+ "grad_norm": 0.7354114055633545,
+ "learning_rate": 6e-05,
+ "loss": 2.7164,
+ "step": 3
+ },
+ {
+ "epoch": 0.08163265306122448,
+ "grad_norm": 0.7292255759239197,
+ "learning_rate": 8e-05,
+ "loss": 2.7174,
+ "step": 4
+ },
+ {
+ "epoch": 0.10204081632653061,
+ "grad_norm": 0.6898028254508972,
+ "learning_rate": 0.0001,
+ "loss": 2.6891,
+ "step": 5
+ },
+ {
+ "epoch": 0.12244897959183673,
+ "grad_norm": 0.6861400604248047,
+ "learning_rate": 0.00012,
+ "loss": 2.6545,
+ "step": 6
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 0.7510350346565247,
+ "learning_rate": 0.00014,
+ "loss": 2.5656,
+ "step": 7
+ },
+ {
+ "epoch": 0.16326530612244897,
+ "grad_norm": 0.8011165261268616,
+ "learning_rate": 0.00016,
+ "loss": 2.4519,
+ "step": 8
+ },
+ {
+ "epoch": 0.1836734693877551,
+ "grad_norm": 0.8624005317687988,
+ "learning_rate": 0.00018,
+ "loss": 2.3178,
+ "step": 9
+ },
+ {
+ "epoch": 0.20408163265306123,
+ "grad_norm": 0.8004987835884094,
+ "learning_rate": 0.0002,
+ "loss": 2.1783,
+ "step": 10
+ },
+ {
+ "epoch": 0.22448979591836735,
+ "grad_norm": 0.6362400054931641,
+ "learning_rate": 0.000199985736255971,
+ "loss": 2.0252,
+ "step": 11
+ },
+ {
+ "epoch": 0.24489795918367346,
+ "grad_norm": 0.7930936217308044,
+ "learning_rate": 0.0001999429490929718,
+ "loss": 1.8839,
+ "step": 12
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "grad_norm": 0.5149843096733093,
+ "learning_rate": 0.00019987165071710527,
+ "loss": 1.8064,
+ "step": 13
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "eval_loss": 1.6734941005706787,
+ "eval_runtime": 271.2615,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 13
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.42121434211730957,
+ "learning_rate": 0.00019977186146800707,
+ "loss": 1.7922,
+ "step": 14
+ },
+ {
+ "epoch": 0.30612244897959184,
+ "grad_norm": 0.3523242771625519,
+ "learning_rate": 0.0001996436098130433,
+ "loss": 1.7711,
+ "step": 15
+ },
+ {
+ "epoch": 0.32653061224489793,
+ "grad_norm": 0.3384595215320587,
+ "learning_rate": 0.00019948693233918952,
+ "loss": 1.7152,
+ "step": 16
+ },
+ {
+ "epoch": 0.3469387755102041,
+ "grad_norm": 0.34942421317100525,
+ "learning_rate": 0.00019930187374259337,
+ "loss": 1.7112,
+ "step": 17
+ },
+ {
+ "epoch": 0.3673469387755102,
+ "grad_norm": 0.31712639331817627,
+ "learning_rate": 0.00019908848681582391,
+ "loss": 1.7059,
+ "step": 18
+ },
+ {
+ "epoch": 0.3877551020408163,
+ "grad_norm": 0.2875436842441559,
+ "learning_rate": 0.00019884683243281116,
+ "loss": 1.6468,
+ "step": 19
+ },
+ {
+ "epoch": 0.40816326530612246,
+ "grad_norm": 0.24433130025863647,
+ "learning_rate": 0.00019857697953148037,
+ "loss": 1.6408,
+ "step": 20
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 0.21414674818515778,
+ "learning_rate": 0.00019827900509408581,
+ "loss": 1.616,
+ "step": 21
+ },
+ {
+ "epoch": 0.4489795918367347,
+ "grad_norm": 0.21537622809410095,
+ "learning_rate": 0.00019795299412524945,
+ "loss": 1.609,
+ "step": 22
+ },
+ {
+ "epoch": 0.46938775510204084,
+ "grad_norm": 0.2432074397802353,
+ "learning_rate": 0.00019759903962771156,
+ "loss": 1.6066,
+ "step": 23
+ },
+ {
+ "epoch": 0.4897959183673469,
+ "grad_norm": 0.2359839379787445,
+ "learning_rate": 0.00019721724257579907,
+ "loss": 1.5851,
+ "step": 24
+ },
+ {
+ "epoch": 0.5102040816326531,
+ "grad_norm": 0.22065888345241547,
+ "learning_rate": 0.00019680771188662044,
+ "loss": 1.5739,
+ "step": 25
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "grad_norm": 0.20339132845401764,
+ "learning_rate": 0.0001963705643889941,
+ "loss": 1.5513,
+ "step": 26
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "eval_loss": 1.4832030534744263,
+ "eval_runtime": 271.2449,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 26
+ },
+ {
+ "epoch": 0.5510204081632653,
+ "grad_norm": 0.18875224888324738,
+ "learning_rate": 0.00019590592479012023,
+ "loss": 1.5378,
+ "step": 27
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.18564417958259583,
+ "learning_rate": 0.00019541392564000488,
+ "loss": 1.5212,
+ "step": 28
+ },
+ {
+ "epoch": 0.5918367346938775,
+ "grad_norm": 0.16226942837238312,
+ "learning_rate": 0.00019489470729364692,
+ "loss": 1.5391,
+ "step": 29
+ },
+ {
+ "epoch": 0.6122448979591837,
+ "grad_norm": 0.15650039911270142,
+ "learning_rate": 0.00019434841787099803,
+ "loss": 1.511,
+ "step": 30
+ },
+ {
+ "epoch": 0.6326530612244898,
+ "grad_norm": 0.15976540744304657,
+ "learning_rate": 0.00019377521321470805,
+ "loss": 1.5119,
+ "step": 31
+ },
+ {
+ "epoch": 0.6530612244897959,
+ "grad_norm": 0.16409288346767426,
+ "learning_rate": 0.00019317525684566685,
+ "loss": 1.4909,
+ "step": 32
+ },
+ {
+ "epoch": 0.673469387755102,
+ "grad_norm": 0.15468019247055054,
+ "learning_rate": 0.00019254871991635598,
+ "loss": 1.4951,
+ "step": 33
+ },
+ {
+ "epoch": 0.6938775510204082,
+ "grad_norm": 0.1462036371231079,
+ "learning_rate": 0.00019189578116202307,
+ "loss": 1.4643,
+ "step": 34
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 0.1541963368654251,
+ "learning_rate": 0.00019121662684969335,
+ "loss": 1.5159,
+ "step": 35
+ },
+ {
+ "epoch": 0.7346938775510204,
+ "grad_norm": 0.14798064529895782,
+ "learning_rate": 0.00019051145072503215,
+ "loss": 1.4741,
+ "step": 36
+ },
+ {
+ "epoch": 0.7551020408163265,
+ "grad_norm": 0.13914817571640015,
+ "learning_rate": 0.00018978045395707418,
+ "loss": 1.4788,
+ "step": 37
+ },
+ {
+ "epoch": 0.7755102040816326,
+ "grad_norm": 0.15608824789524078,
+ "learning_rate": 0.00018902384508083517,
+ "loss": 1.4687,
+ "step": 38
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "grad_norm": 0.14460116624832153,
+ "learning_rate": 0.00018824183993782192,
+ "loss": 1.482,
+ "step": 39
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "eval_loss": 1.411073088645935,
+ "eval_runtime": 271.292,
+ "eval_samples_per_second": 6.248,
+ "eval_steps_per_second": 3.126,
+ "step": 39
+ },
+ {
+ "epoch": 0.8163265306122449,
+ "grad_norm": 0.15740551054477692,
+ "learning_rate": 0.00018743466161445823,
+ "loss": 1.4486,
+ "step": 40
+ },
+ {
+ "epoch": 0.8367346938775511,
+ "grad_norm": 0.14149661362171173,
+ "learning_rate": 0.00018660254037844388,
+ "loss": 1.4353,
+ "step": 41
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.14034292101860046,
+ "learning_rate": 0.0001857457136130651,
+ "loss": 1.4523,
+ "step": 42
+ },
+ {
+ "epoch": 0.8775510204081632,
+ "grad_norm": 0.1487722396850586,
+ "learning_rate": 0.00018486442574947511,
+ "loss": 1.4095,
+ "step": 43
+ },
+ {
+ "epoch": 0.8979591836734694,
+ "grad_norm": 0.17400234937667847,
+ "learning_rate": 0.00018395892819696389,
+ "loss": 1.4414,
+ "step": 44
+ },
+ {
+ "epoch": 0.9183673469387755,
+ "grad_norm": 0.1741325408220291,
+ "learning_rate": 0.00018302947927123766,
+ "loss": 1.4379,
+ "step": 45
+ },
+ {
+ "epoch": 0.9387755102040817,
+ "grad_norm": 0.15319454669952393,
+ "learning_rate": 0.00018207634412072764,
+ "loss": 1.405,
+ "step": 46
+ },
+ {
+ "epoch": 0.9591836734693877,
+ "grad_norm": 0.15876264870166779,
+ "learning_rate": 0.00018109979465095013,
+ "loss": 1.4122,
+ "step": 47
+ },
+ {
+ "epoch": 0.9795918367346939,
+ "grad_norm": 0.17120805382728577,
+ "learning_rate": 0.00018010010944693848,
+ "loss": 1.4132,
+ "step": 48
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.1436116099357605,
+ "learning_rate": 0.00017907757369376985,
+ "loss": 1.416,
+ "step": 49
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.1707429438829422,
+ "learning_rate": 0.0001780324790952092,
+ "loss": 1.3913,
+ "step": 50
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.17117524147033691,
+ "learning_rate": 0.00017696512379049325,
+ "loss": 1.3963,
+ "step": 51
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "grad_norm": 0.13410089910030365,
+ "learning_rate": 0.0001758758122692791,
+ "loss": 1.392,
+ "step": 52
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "eval_loss": 1.3676769733428955,
+ "eval_runtime": 270.8566,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 52
+ },
+ {
+ "epoch": 1.0612244897959184,
+ "grad_norm": 0.18877607583999634,
+ "learning_rate": 0.00017476485528478093,
+ "loss": 1.3854,
+ "step": 53
+ },
+ {
+ "epoch": 1.0816326530612246,
+ "grad_norm": 0.1752927452325821,
+ "learning_rate": 0.00017363256976511972,
+ "loss": 1.3759,
+ "step": 54
+ },
+ {
+ "epoch": 1.1020408163265305,
+ "grad_norm": 0.17180170118808746,
+ "learning_rate": 0.000172479278722912,
+ "loss": 1.3614,
+ "step": 55
+ },
+ {
+ "epoch": 1.1224489795918366,
+ "grad_norm": 0.1640290915966034,
+ "learning_rate": 0.00017130531116312203,
+ "loss": 1.3853,
+ "step": 56
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 0.2047068476676941,
+ "learning_rate": 0.0001701110019892053,
+ "loss": 1.3699,
+ "step": 57
+ },
+ {
+ "epoch": 1.163265306122449,
+ "grad_norm": 0.1835869997739792,
+ "learning_rate": 0.00016889669190756868,
+ "loss": 1.3403,
+ "step": 58
+ },
+ {
+ "epoch": 1.183673469387755,
+ "grad_norm": 0.16733241081237793,
+ "learning_rate": 0.00016766272733037576,
+ "loss": 1.3609,
+ "step": 59
+ },
+ {
+ "epoch": 1.2040816326530612,
+ "grad_norm": 0.178726926445961,
+ "learning_rate": 0.00016640946027672392,
+ "loss": 1.3651,
+ "step": 60
+ },
+ {
+ "epoch": 1.2244897959183674,
+ "grad_norm": 0.16719630360603333,
+ "learning_rate": 0.00016513724827222227,
+ "loss": 1.3676,
+ "step": 61
+ },
+ {
+ "epoch": 1.2448979591836735,
+ "grad_norm": 0.15999363362789154,
+ "learning_rate": 0.00016384645424699835,
+ "loss": 1.3651,
+ "step": 62
+ },
+ {
+ "epoch": 1.2653061224489797,
+ "grad_norm": 0.1705988198518753,
+ "learning_rate": 0.00016253744643216368,
+ "loss": 1.3757,
+ "step": 63
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 0.14996370673179626,
+ "learning_rate": 0.0001612105982547663,
+ "loss": 1.3474,
+ "step": 64
+ },
+ {
+ "epoch": 1.306122448979592,
+ "grad_norm": 0.19127260148525238,
+ "learning_rate": 0.0001598662882312615,
+ "loss": 1.3414,
+ "step": 65
+ },
+ {
+ "epoch": 1.306122448979592,
+ "eval_loss": 1.331880807876587,
+ "eval_runtime": 270.8424,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 65
+ },
+ {
+ "epoch": 1.3265306122448979,
+ "grad_norm": 0.16125527024269104,
+ "learning_rate": 0.00015850489985953076,
+ "loss": 1.3509,
+ "step": 66
+ },
+ {
+ "epoch": 1.346938775510204,
+ "grad_norm": 0.1979473978281021,
+ "learning_rate": 0.00015712682150947923,
+ "loss": 1.3579,
+ "step": 67
+ },
+ {
+ "epoch": 1.3673469387755102,
+ "grad_norm": 0.18317992985248566,
+ "learning_rate": 0.00015573244631224365,
+ "loss": 1.3341,
+ "step": 68
+ },
+ {
+ "epoch": 1.3877551020408163,
+ "grad_norm": 0.1646898239850998,
+ "learning_rate": 0.0001543221720480419,
+ "loss": 1.3361,
+ "step": 69
+ },
+ {
+ "epoch": 1.4081632653061225,
+ "grad_norm": 0.1760271042585373,
+ "learning_rate": 0.00015289640103269625,
+ "loss": 1.358,
+ "step": 70
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.165283203125,
+ "learning_rate": 0.0001514555400028629,
+ "loss": 1.3072,
+ "step": 71
+ },
+ {
+ "epoch": 1.4489795918367347,
+ "grad_norm": 0.1507076472043991,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 1.3133,
+ "step": 72
+ },
+ {
+ "epoch": 1.469387755102041,
+ "grad_norm": 0.16913647949695587,
+ "learning_rate": 0.00014853019625310813,
+ "loss": 1.3232,
+ "step": 73
+ },
+ {
+ "epoch": 1.489795918367347,
+ "grad_norm": 0.18266479671001434,
+ "learning_rate": 0.0001470465480602756,
+ "loss": 1.3512,
+ "step": 74
+ },
+ {
+ "epoch": 1.510204081632653,
+ "grad_norm": 0.19301828742027283,
+ "learning_rate": 0.0001455494786690634,
+ "loss": 1.3241,
+ "step": 75
+ },
+ {
+ "epoch": 1.5306122448979593,
+ "grad_norm": 0.16109652817249298,
+ "learning_rate": 0.00014403941515576344,
+ "loss": 1.3256,
+ "step": 76
+ },
+ {
+ "epoch": 1.5510204081632653,
+ "grad_norm": 0.17053867876529694,
+ "learning_rate": 0.00014251678830356408,
+ "loss": 1.3162,
+ "step": 77
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 0.17348544299602509,
+ "learning_rate": 0.00014098203247965875,
+ "loss": 1.3213,
+ "step": 78
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "eval_loss": 1.3028697967529297,
+ "eval_runtime": 270.8095,
+ "eval_samples_per_second": 6.259,
+ "eval_steps_per_second": 3.131,
+ "step": 78
+ },
+ {
+ "epoch": 1.5918367346938775,
+ "grad_norm": 0.1703907549381256,
+ "learning_rate": 0.00013943558551133186,
+ "loss": 1.3073,
+ "step": 79
+ },
+ {
+ "epoch": 1.6122448979591837,
+ "grad_norm": 0.17313100397586823,
+ "learning_rate": 0.0001378778885610576,
+ "loss": 1.3232,
+ "step": 80
+ },
+ {
+ "epoch": 1.6326530612244898,
+ "grad_norm": 0.17237025499343872,
+ "learning_rate": 0.00013630938600064747,
+ "loss": 1.3406,
+ "step": 81
+ },
+ {
+ "epoch": 1.6530612244897958,
+ "grad_norm": 0.19658459722995758,
+ "learning_rate": 0.00013473052528448201,
+ "loss": 1.3114,
+ "step": 82
+ },
+ {
+ "epoch": 1.6734693877551021,
+ "grad_norm": 0.20599938929080963,
+ "learning_rate": 0.0001331417568218636,
+ "loss": 1.3288,
+ "step": 83
+ },
+ {
+ "epoch": 1.693877551020408,
+ "grad_norm": 0.17759399116039276,
+ "learning_rate": 0.00013154353384852558,
+ "loss": 1.2995,
+ "step": 84
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 0.18712250888347626,
+ "learning_rate": 0.00012993631229733582,
+ "loss": 1.2895,
+ "step": 85
+ },
+ {
+ "epoch": 1.7346938775510203,
+ "grad_norm": 0.1991330236196518,
+ "learning_rate": 0.00012832055066823038,
+ "loss": 1.2886,
+ "step": 86
+ },
+ {
+ "epoch": 1.7551020408163265,
+ "grad_norm": 0.22125203907489777,
+ "learning_rate": 0.00012669670989741517,
+ "loss": 1.3233,
+ "step": 87
+ },
+ {
+ "epoch": 1.7755102040816326,
+ "grad_norm": 0.2052813619375229,
+ "learning_rate": 0.00012506525322587207,
+ "loss": 1.3079,
+ "step": 88
+ },
+ {
+ "epoch": 1.7959183673469388,
+ "grad_norm": 0.19290736317634583,
+ "learning_rate": 0.00012342664606720822,
+ "loss": 1.3174,
+ "step": 89
+ },
+ {
+ "epoch": 1.816326530612245,
+ "grad_norm": 0.20912542939186096,
+ "learning_rate": 0.00012178135587488515,
+ "loss": 1.2915,
+ "step": 90
+ },
+ {
+ "epoch": 1.836734693877551,
+ "grad_norm": 0.20760588347911835,
+ "learning_rate": 0.00012012985200886602,
+ "loss": 1.3028,
+ "step": 91
+ },
+ {
+ "epoch": 1.836734693877551,
+ "eval_loss": 1.2795333862304688,
+ "eval_runtime": 270.6525,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 91
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.1996900886297226,
+ "learning_rate": 0.00011847260560171896,
+ "loss": 1.3119,
+ "step": 92
+ },
+ {
+ "epoch": 1.8775510204081631,
+ "grad_norm": 0.23766876757144928,
+ "learning_rate": 0.00011681008942421483,
+ "loss": 1.2978,
+ "step": 93
+ },
+ {
+ "epoch": 1.8979591836734695,
+ "grad_norm": 0.19782397150993347,
+ "learning_rate": 0.00011514277775045768,
+ "loss": 1.2955,
+ "step": 94
+ },
+ {
+ "epoch": 1.9183673469387754,
+ "grad_norm": 0.22519494593143463,
+ "learning_rate": 0.00011347114622258612,
+ "loss": 1.2957,
+ "step": 95
+ },
+ {
+ "epoch": 1.9387755102040818,
+ "grad_norm": 0.2590245306491852,
+ "learning_rate": 0.00011179567171508463,
+ "loss": 1.2809,
+ "step": 96
+ },
+ {
+ "epoch": 1.9591836734693877,
+ "grad_norm": 0.2235420197248459,
+ "learning_rate": 0.00011011683219874323,
+ "loss": 1.2784,
+ "step": 97
+ },
+ {
+ "epoch": 1.9795918367346939,
+ "grad_norm": 0.285740464925766,
+ "learning_rate": 0.00010843510660430447,
+ "loss": 1.309,
+ "step": 98
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.20554350316524506,
+ "learning_rate": 0.00010675097468583652,
+ "loss": 1.273,
+ "step": 99
+ },
+ {
+ "epoch": 2.020408163265306,
+ "grad_norm": 0.24468418955802917,
+ "learning_rate": 0.00010506491688387127,
+ "loss": 1.2833,
+ "step": 100
+ },
+ {
+ "epoch": 2.020408163265306,
+ "grad_norm": 0.21553528308868408,
+ "learning_rate": 0.00010337741418834684,
+ "loss": 1.2669,
+ "step": 101
+ },
+ {
+ "epoch": 2.0408163265306123,
+ "grad_norm": 0.22015659511089325,
+ "learning_rate": 0.0001016889480013931,
+ "loss": 1.2795,
+ "step": 102
+ },
+ {
+ "epoch": 2.061224489795918,
+ "grad_norm": 0.2028799206018448,
+ "learning_rate": 0.0001,
+ "loss": 1.2584,
+ "step": 103
+ },
+ {
+ "epoch": 2.0816326530612246,
+ "grad_norm": 0.23474323749542236,
+ "learning_rate": 9.83110519986069e-05,
+ "loss": 1.2761,
+ "step": 104
+ },
+ {
+ "epoch": 2.0816326530612246,
+ "eval_loss": 1.2696796655654907,
+ "eval_runtime": 270.6586,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 104
+ },
+ {
+ "epoch": 2.1020408163265305,
+ "grad_norm": 0.21070216596126556,
+ "learning_rate": 9.662258581165319e-05,
+ "loss": 1.2808,
+ "step": 105
+ },
+ {
+ "epoch": 2.122448979591837,
+ "grad_norm": 0.21867221593856812,
+ "learning_rate": 9.493508311612874e-05,
+ "loss": 1.2873,
+ "step": 106
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.21630822122097015,
+ "learning_rate": 9.324902531416349e-05,
+ "loss": 1.2527,
+ "step": 107
+ },
+ {
+ "epoch": 2.163265306122449,
+ "grad_norm": 0.2134082019329071,
+ "learning_rate": 9.156489339569554e-05,
+ "loss": 1.2755,
+ "step": 108
+ },
+ {
+ "epoch": 2.183673469387755,
+ "grad_norm": 0.22310714423656464,
+ "learning_rate": 8.98831678012568e-05,
+ "loss": 1.2512,
+ "step": 109
+ },
+ {
+ "epoch": 2.204081632653061,
+ "grad_norm": 0.2365124374628067,
+ "learning_rate": 8.820432828491542e-05,
+ "loss": 1.2725,
+ "step": 110
+ },
+ {
+ "epoch": 2.2244897959183674,
+ "grad_norm": 0.2086496651172638,
+ "learning_rate": 8.652885377741393e-05,
+ "loss": 1.2488,
+ "step": 111
+ },
+ {
+ "epoch": 2.2448979591836733,
+ "grad_norm": 0.20848101377487183,
+ "learning_rate": 8.485722224954237e-05,
+ "loss": 1.2793,
+ "step": 112
+ },
+ {
+ "epoch": 2.2653061224489797,
+ "grad_norm": 0.20784686505794525,
+ "learning_rate": 8.31899105757852e-05,
+ "loss": 1.2564,
+ "step": 113
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.21896174550056458,
+ "learning_rate": 8.15273943982811e-05,
+ "loss": 1.2515,
+ "step": 114
+ },
+ {
+ "epoch": 2.306122448979592,
+ "grad_norm": 0.21367855370044708,
+ "learning_rate": 7.987014799113397e-05,
+ "loss": 1.248,
+ "step": 115
+ },
+ {
+ "epoch": 2.326530612244898,
+ "grad_norm": 0.20891636610031128,
+ "learning_rate": 7.821864412511485e-05,
+ "loss": 1.2753,
+ "step": 116
+ },
+ {
+ "epoch": 2.3469387755102042,
+ "grad_norm": 0.2092975378036499,
+ "learning_rate": 7.65733539327918e-05,
+ "loss": 1.2509,
+ "step": 117
+ },
+ {
+ "epoch": 2.3469387755102042,
+ "eval_loss": 1.258699655532837,
+ "eval_runtime": 270.5384,
+ "eval_samples_per_second": 6.265,
+ "eval_steps_per_second": 3.134,
+ "step": 117
+ },
+ {
+ "epoch": 2.36734693877551,
+ "grad_norm": 0.1905972808599472,
+ "learning_rate": 7.493474677412794e-05,
+ "loss": 1.2516,
+ "step": 118
+ },
+ {
+ "epoch": 2.387755102040816,
+ "grad_norm": 0.19716158509254456,
+ "learning_rate": 7.330329010258483e-05,
+ "loss": 1.2665,
+ "step": 119
+ },
+ {
+ "epoch": 2.4081632653061225,
+ "grad_norm": 0.1953389048576355,
+ "learning_rate": 7.16794493317696e-05,
+ "loss": 1.2661,
+ "step": 120
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.1990067958831787,
+ "learning_rate": 7.006368770266421e-05,
+ "loss": 1.2619,
+ "step": 121
+ },
+ {
+ "epoch": 2.4489795918367347,
+ "grad_norm": 0.1954919546842575,
+ "learning_rate": 6.845646615147445e-05,
+ "loss": 1.2736,
+ "step": 122
+ },
+ {
+ "epoch": 2.4693877551020407,
+ "grad_norm": 0.18382853269577026,
+ "learning_rate": 6.685824317813643e-05,
+ "loss": 1.2732,
+ "step": 123
+ },
+ {
+ "epoch": 2.489795918367347,
+ "grad_norm": 0.18729491531848907,
+ "learning_rate": 6.526947471551798e-05,
+ "loss": 1.2509,
+ "step": 124
+ },
+ {
+ "epoch": 2.510204081632653,
+ "grad_norm": 0.2034740000963211,
+ "learning_rate": 6.369061399935255e-05,
+ "loss": 1.2829,
+ "step": 125
+ },
+ {
+ "epoch": 2.5306122448979593,
+ "grad_norm": 0.1952620893716812,
+ "learning_rate": 6.21221114389424e-05,
+ "loss": 1.2689,
+ "step": 126
+ },
+ {
+ "epoch": 2.5510204081632653,
+ "grad_norm": 0.1986168622970581,
+ "learning_rate": 6.0564414488668165e-05,
+ "loss": 1.2644,
+ "step": 127
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.19526751339435577,
+ "learning_rate": 5.901796752034128e-05,
+ "loss": 1.265,
+ "step": 128
+ },
+ {
+ "epoch": 2.5918367346938775,
+ "grad_norm": 0.195367693901062,
+ "learning_rate": 5.748321169643596e-05,
+ "loss": 1.2782,
+ "step": 129
+ },
+ {
+ "epoch": 2.612244897959184,
+ "grad_norm": 0.18351928889751434,
+ "learning_rate": 5.596058484423656e-05,
+ "loss": 1.2884,
+ "step": 130
+ },
+ {
+ "epoch": 2.612244897959184,
+ "eval_loss": 1.2471545934677124,
+ "eval_runtime": 270.4953,
+ "eval_samples_per_second": 6.266,
+ "eval_steps_per_second": 3.135,
+ "step": 130
+ },
+ {
+ "epoch": 2.63265306122449,
+ "grad_norm": 0.2015760987997055,
+ "learning_rate": 5.44505213309366e-05,
+ "loss": 1.2536,
+ "step": 131
+ },
+ {
+ "epoch": 2.6530612244897958,
+ "grad_norm": 0.1734190732240677,
+ "learning_rate": 5.2953451939724454e-05,
+ "loss": 1.2628,
+ "step": 132
+ },
+ {
+ "epoch": 2.673469387755102,
+ "grad_norm": 0.214066281914711,
+ "learning_rate": 5.146980374689192e-05,
+ "loss": 1.2543,
+ "step": 133
+ },
+ {
+ "epoch": 2.693877551020408,
+ "grad_norm": 0.17507924139499664,
+ "learning_rate": 5.000000000000002e-05,
+ "loss": 1.2665,
+ "step": 134
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 0.1778109222650528,
+ "learning_rate": 4.854445999713715e-05,
+ "loss": 1.2789,
+ "step": 135
+ },
+ {
+ "epoch": 2.7346938775510203,
+ "grad_norm": 0.1856827288866043,
+ "learning_rate": 4.710359896730379e-05,
+ "loss": 1.2481,
+ "step": 136
+ },
+ {
+ "epoch": 2.7551020408163263,
+ "grad_norm": 0.17856694757938385,
+ "learning_rate": 4.567782795195816e-05,
+ "loss": 1.2732,
+ "step": 137
+ },
+ {
+ "epoch": 2.7755102040816326,
+ "grad_norm": 0.21598489582538605,
+ "learning_rate": 4.426755368775637e-05,
+ "loss": 1.2525,
+ "step": 138
+ },
+ {
+ "epoch": 2.795918367346939,
+ "grad_norm": 0.17308436334133148,
+ "learning_rate": 4.287317849052075e-05,
+ "loss": 1.2665,
+ "step": 139
+ },
+ {
+ "epoch": 2.816326530612245,
+ "grad_norm": 0.19207212328910828,
+ "learning_rate": 4.149510014046922e-05,
+ "loss": 1.2681,
+ "step": 140
+ },
+ {
+ "epoch": 2.836734693877551,
+ "grad_norm": 0.19626958668231964,
+ "learning_rate": 4.013371176873849e-05,
+ "loss": 1.2727,
+ "step": 141
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.1986483484506607,
+ "learning_rate": 3.878940174523371e-05,
+ "loss": 1.2414,
+ "step": 142
+ },
+ {
+ "epoch": 2.877551020408163,
+ "grad_norm": 0.19369089603424072,
+ "learning_rate": 3.746255356783632e-05,
+ "loss": 1.254,
+ "step": 143
+ },
+ {
+ "epoch": 2.877551020408163,
+ "eval_loss": 1.2410293817520142,
+ "eval_runtime": 270.6762,
+ "eval_samples_per_second": 6.262,
+ "eval_steps_per_second": 3.133,
+ "step": 143
+ },
+ {
+ "epoch": 2.8979591836734695,
+ "grad_norm": 0.20910531282424927,
+ "learning_rate": 3.615354575300166e-05,
+ "loss": 1.2541,
+ "step": 144
+ },
+ {
+ "epoch": 2.9183673469387754,
+ "grad_norm": 0.19536806643009186,
+ "learning_rate": 3.4862751727777797e-05,
+ "loss": 1.2517,
+ "step": 145
+ },
+ {
+ "epoch": 2.938775510204082,
+ "grad_norm": 0.18630966544151306,
+ "learning_rate": 3.3590539723276083e-05,
+ "loss": 1.2473,
+ "step": 146
+ },
+ {
+ "epoch": 2.9591836734693877,
+ "grad_norm": 0.1874723732471466,
+ "learning_rate": 3.233727266962425e-05,
+ "loss": 1.244,
+ "step": 147
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 196,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 49,
+ "total_flos": 3.0628052408991744e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-147/training_args.bin b/checkpoint-147/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61
--- /dev/null
+++ b/checkpoint-147/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816
diff --git a/checkpoint-196/README.md b/checkpoint-196/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7
--- /dev/null
+++ b/checkpoint-196/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+This checkpoint contains only the LoRA adapter weights and tokenizer files, not the full model, so it has to be loaded on top of the base model.
+
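+A minimal loading sketch (the local path below is a placeholder; point it at wherever this checkpoint lives):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+checkpoint_dir = "./checkpoint-196"  # placeholder path to this checkpoint directory
+
+tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
+model = AutoModelForCausalLM.from_pretrained(base_id)
+model = PeftModel.from_pretrained(model, checkpoint_dir)
+```
+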
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.10.0
\ No newline at end of file
diff --git a/checkpoint-196/adapter_config.json b/checkpoint-196/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2
--- /dev/null
+++ b/checkpoint-196/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-196/adapter_model.safetensors b/checkpoint-196/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d41bc9e3dcdf277e604cca1cf5b2badf5137c297
--- /dev/null
+++ b/checkpoint-196/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7149dfd1479c35b75fc75c4e9be3785070da91bd7c29d040e9a259ea5111014
+size 100966336
diff --git a/checkpoint-196/optimizer.pt b/checkpoint-196/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..2b4f86376842744d04c564484c1f9305e286115e
--- /dev/null
+++ b/checkpoint-196/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96f2429392a17aa7909b16091d5a0b62592f80090a1a9943b203b1e1c29e66f8
+size 50916644
diff --git a/checkpoint-196/rng_state.pth b/checkpoint-196/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9366e994ff6abae6251829869eedce66f55dc840
--- /dev/null
+++ b/checkpoint-196/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a160c2864b63ef158843056f3ba263b2da60c6bef707459f056731cde2e27043
+size 14244
diff --git a/checkpoint-196/scheduler.pt b/checkpoint-196/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..74c904cc3f97ac3da7ec72be923894928f8e70c1
--- /dev/null
+++ b/checkpoint-196/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e22ca0a50bab80d00c8b8910bffb983a348f8762b7cf025e6f8e64a05a938289
+size 1064
diff --git a/checkpoint-196/special_tokens_map.json b/checkpoint-196/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-196/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-196/tokenizer.model b/checkpoint-196/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-196/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-196/tokenizer_config.json b/checkpoint-196/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837
--- /dev/null
+++ b/checkpoint-196/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-196/trainer_state.json b/checkpoint-196/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..f2a21907b42052c3b124b28477dd1724df48ffb8
--- /dev/null
+++ b/checkpoint-196/trainer_state.json
@@ -0,0 +1,1521 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.938775510204082,
+ "eval_steps": 13,
+ "global_step": 196,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02040816326530612,
+ "grad_norm": 0.7881951332092285,
+ "learning_rate": 2e-05,
+ "loss": 2.7509,
+ "step": 1
+ },
+ {
+ "epoch": 0.02040816326530612,
+ "eval_loss": 2.6902382373809814,
+ "eval_runtime": 269.5606,
+ "eval_samples_per_second": 6.288,
+ "eval_steps_per_second": 3.146,
+ "step": 1
+ },
+ {
+ "epoch": 0.04081632653061224,
+ "grad_norm": 0.789082407951355,
+ "learning_rate": 4e-05,
+ "loss": 2.7449,
+ "step": 2
+ },
+ {
+ "epoch": 0.061224489795918366,
+ "grad_norm": 0.7354114055633545,
+ "learning_rate": 6e-05,
+ "loss": 2.7164,
+ "step": 3
+ },
+ {
+ "epoch": 0.08163265306122448,
+ "grad_norm": 0.7292255759239197,
+ "learning_rate": 8e-05,
+ "loss": 2.7174,
+ "step": 4
+ },
+ {
+ "epoch": 0.10204081632653061,
+ "grad_norm": 0.6898028254508972,
+ "learning_rate": 0.0001,
+ "loss": 2.6891,
+ "step": 5
+ },
+ {
+ "epoch": 0.12244897959183673,
+ "grad_norm": 0.6861400604248047,
+ "learning_rate": 0.00012,
+ "loss": 2.6545,
+ "step": 6
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 0.7510350346565247,
+ "learning_rate": 0.00014,
+ "loss": 2.5656,
+ "step": 7
+ },
+ {
+ "epoch": 0.16326530612244897,
+ "grad_norm": 0.8011165261268616,
+ "learning_rate": 0.00016,
+ "loss": 2.4519,
+ "step": 8
+ },
+ {
+ "epoch": 0.1836734693877551,
+ "grad_norm": 0.8624005317687988,
+ "learning_rate": 0.00018,
+ "loss": 2.3178,
+ "step": 9
+ },
+ {
+ "epoch": 0.20408163265306123,
+ "grad_norm": 0.8004987835884094,
+ "learning_rate": 0.0002,
+ "loss": 2.1783,
+ "step": 10
+ },
+ {
+ "epoch": 0.22448979591836735,
+ "grad_norm": 0.6362400054931641,
+ "learning_rate": 0.000199985736255971,
+ "loss": 2.0252,
+ "step": 11
+ },
+ {
+ "epoch": 0.24489795918367346,
+ "grad_norm": 0.7930936217308044,
+ "learning_rate": 0.0001999429490929718,
+ "loss": 1.8839,
+ "step": 12
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "grad_norm": 0.5149843096733093,
+ "learning_rate": 0.00019987165071710527,
+ "loss": 1.8064,
+ "step": 13
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "eval_loss": 1.6734941005706787,
+ "eval_runtime": 271.2615,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 13
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.42121434211730957,
+ "learning_rate": 0.00019977186146800707,
+ "loss": 1.7922,
+ "step": 14
+ },
+ {
+ "epoch": 0.30612244897959184,
+ "grad_norm": 0.3523242771625519,
+ "learning_rate": 0.0001996436098130433,
+ "loss": 1.7711,
+ "step": 15
+ },
+ {
+ "epoch": 0.32653061224489793,
+ "grad_norm": 0.3384595215320587,
+ "learning_rate": 0.00019948693233918952,
+ "loss": 1.7152,
+ "step": 16
+ },
+ {
+ "epoch": 0.3469387755102041,
+ "grad_norm": 0.34942421317100525,
+ "learning_rate": 0.00019930187374259337,
+ "loss": 1.7112,
+ "step": 17
+ },
+ {
+ "epoch": 0.3673469387755102,
+ "grad_norm": 0.31712639331817627,
+ "learning_rate": 0.00019908848681582391,
+ "loss": 1.7059,
+ "step": 18
+ },
+ {
+ "epoch": 0.3877551020408163,
+ "grad_norm": 0.2875436842441559,
+ "learning_rate": 0.00019884683243281116,
+ "loss": 1.6468,
+ "step": 19
+ },
+ {
+ "epoch": 0.40816326530612246,
+ "grad_norm": 0.24433130025863647,
+ "learning_rate": 0.00019857697953148037,
+ "loss": 1.6408,
+ "step": 20
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 0.21414674818515778,
+ "learning_rate": 0.00019827900509408581,
+ "loss": 1.616,
+ "step": 21
+ },
+ {
+ "epoch": 0.4489795918367347,
+ "grad_norm": 0.21537622809410095,
+ "learning_rate": 0.00019795299412524945,
+ "loss": 1.609,
+ "step": 22
+ },
+ {
+ "epoch": 0.46938775510204084,
+ "grad_norm": 0.2432074397802353,
+ "learning_rate": 0.00019759903962771156,
+ "loss": 1.6066,
+ "step": 23
+ },
+ {
+ "epoch": 0.4897959183673469,
+ "grad_norm": 0.2359839379787445,
+ "learning_rate": 0.00019721724257579907,
+ "loss": 1.5851,
+ "step": 24
+ },
+ {
+ "epoch": 0.5102040816326531,
+ "grad_norm": 0.22065888345241547,
+ "learning_rate": 0.00019680771188662044,
+ "loss": 1.5739,
+ "step": 25
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "grad_norm": 0.20339132845401764,
+ "learning_rate": 0.0001963705643889941,
+ "loss": 1.5513,
+ "step": 26
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "eval_loss": 1.4832030534744263,
+ "eval_runtime": 271.2449,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 26
+ },
+ {
+ "epoch": 0.5510204081632653,
+ "grad_norm": 0.18875224888324738,
+ "learning_rate": 0.00019590592479012023,
+ "loss": 1.5378,
+ "step": 27
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.18564417958259583,
+ "learning_rate": 0.00019541392564000488,
+ "loss": 1.5212,
+ "step": 28
+ },
+ {
+ "epoch": 0.5918367346938775,
+ "grad_norm": 0.16226942837238312,
+ "learning_rate": 0.00019489470729364692,
+ "loss": 1.5391,
+ "step": 29
+ },
+ {
+ "epoch": 0.6122448979591837,
+ "grad_norm": 0.15650039911270142,
+ "learning_rate": 0.00019434841787099803,
+ "loss": 1.511,
+ "step": 30
+ },
+ {
+ "epoch": 0.6326530612244898,
+ "grad_norm": 0.15976540744304657,
+ "learning_rate": 0.00019377521321470805,
+ "loss": 1.5119,
+ "step": 31
+ },
+ {
+ "epoch": 0.6530612244897959,
+ "grad_norm": 0.16409288346767426,
+ "learning_rate": 0.00019317525684566685,
+ "loss": 1.4909,
+ "step": 32
+ },
+ {
+ "epoch": 0.673469387755102,
+ "grad_norm": 0.15468019247055054,
+ "learning_rate": 0.00019254871991635598,
+ "loss": 1.4951,
+ "step": 33
+ },
+ {
+ "epoch": 0.6938775510204082,
+ "grad_norm": 0.1462036371231079,
+ "learning_rate": 0.00019189578116202307,
+ "loss": 1.4643,
+ "step": 34
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 0.1541963368654251,
+ "learning_rate": 0.00019121662684969335,
+ "loss": 1.5159,
+ "step": 35
+ },
+ {
+ "epoch": 0.7346938775510204,
+ "grad_norm": 0.14798064529895782,
+ "learning_rate": 0.00019051145072503215,
+ "loss": 1.4741,
+ "step": 36
+ },
+ {
+ "epoch": 0.7551020408163265,
+ "grad_norm": 0.13914817571640015,
+ "learning_rate": 0.00018978045395707418,
+ "loss": 1.4788,
+ "step": 37
+ },
+ {
+ "epoch": 0.7755102040816326,
+ "grad_norm": 0.15608824789524078,
+ "learning_rate": 0.00018902384508083517,
+ "loss": 1.4687,
+ "step": 38
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "grad_norm": 0.14460116624832153,
+ "learning_rate": 0.00018824183993782192,
+ "loss": 1.482,
+ "step": 39
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "eval_loss": 1.411073088645935,
+ "eval_runtime": 271.292,
+ "eval_samples_per_second": 6.248,
+ "eval_steps_per_second": 3.126,
+ "step": 39
+ },
+ {
+ "epoch": 0.8163265306122449,
+ "grad_norm": 0.15740551054477692,
+ "learning_rate": 0.00018743466161445823,
+ "loss": 1.4486,
+ "step": 40
+ },
+ {
+ "epoch": 0.8367346938775511,
+ "grad_norm": 0.14149661362171173,
+ "learning_rate": 0.00018660254037844388,
+ "loss": 1.4353,
+ "step": 41
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.14034292101860046,
+ "learning_rate": 0.0001857457136130651,
+ "loss": 1.4523,
+ "step": 42
+ },
+ {
+ "epoch": 0.8775510204081632,
+ "grad_norm": 0.1487722396850586,
+ "learning_rate": 0.00018486442574947511,
+ "loss": 1.4095,
+ "step": 43
+ },
+ {
+ "epoch": 0.8979591836734694,
+ "grad_norm": 0.17400234937667847,
+ "learning_rate": 0.00018395892819696389,
+ "loss": 1.4414,
+ "step": 44
+ },
+ {
+ "epoch": 0.9183673469387755,
+ "grad_norm": 0.1741325408220291,
+ "learning_rate": 0.00018302947927123766,
+ "loss": 1.4379,
+ "step": 45
+ },
+ {
+ "epoch": 0.9387755102040817,
+ "grad_norm": 0.15319454669952393,
+ "learning_rate": 0.00018207634412072764,
+ "loss": 1.405,
+ "step": 46
+ },
+ {
+ "epoch": 0.9591836734693877,
+ "grad_norm": 0.15876264870166779,
+ "learning_rate": 0.00018109979465095013,
+ "loss": 1.4122,
+ "step": 47
+ },
+ {
+ "epoch": 0.9795918367346939,
+ "grad_norm": 0.17120805382728577,
+ "learning_rate": 0.00018010010944693848,
+ "loss": 1.4132,
+ "step": 48
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.1436116099357605,
+ "learning_rate": 0.00017907757369376985,
+ "loss": 1.416,
+ "step": 49
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.1707429438829422,
+ "learning_rate": 0.0001780324790952092,
+ "loss": 1.3913,
+ "step": 50
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.17117524147033691,
+ "learning_rate": 0.00017696512379049325,
+ "loss": 1.3963,
+ "step": 51
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "grad_norm": 0.13410089910030365,
+ "learning_rate": 0.0001758758122692791,
+ "loss": 1.392,
+ "step": 52
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "eval_loss": 1.3676769733428955,
+ "eval_runtime": 270.8566,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 52
+ },
+ {
+ "epoch": 1.0612244897959184,
+ "grad_norm": 0.18877607583999634,
+ "learning_rate": 0.00017476485528478093,
+ "loss": 1.3854,
+ "step": 53
+ },
+ {
+ "epoch": 1.0816326530612246,
+ "grad_norm": 0.1752927452325821,
+ "learning_rate": 0.00017363256976511972,
+ "loss": 1.3759,
+ "step": 54
+ },
+ {
+ "epoch": 1.1020408163265305,
+ "grad_norm": 0.17180170118808746,
+ "learning_rate": 0.000172479278722912,
+ "loss": 1.3614,
+ "step": 55
+ },
+ {
+ "epoch": 1.1224489795918366,
+ "grad_norm": 0.1640290915966034,
+ "learning_rate": 0.00017130531116312203,
+ "loss": 1.3853,
+ "step": 56
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 0.2047068476676941,
+ "learning_rate": 0.0001701110019892053,
+ "loss": 1.3699,
+ "step": 57
+ },
+ {
+ "epoch": 1.163265306122449,
+ "grad_norm": 0.1835869997739792,
+ "learning_rate": 0.00016889669190756868,
+ "loss": 1.3403,
+ "step": 58
+ },
+ {
+ "epoch": 1.183673469387755,
+ "grad_norm": 0.16733241081237793,
+ "learning_rate": 0.00016766272733037576,
+ "loss": 1.3609,
+ "step": 59
+ },
+ {
+ "epoch": 1.2040816326530612,
+ "grad_norm": 0.178726926445961,
+ "learning_rate": 0.00016640946027672392,
+ "loss": 1.3651,
+ "step": 60
+ },
+ {
+ "epoch": 1.2244897959183674,
+ "grad_norm": 0.16719630360603333,
+ "learning_rate": 0.00016513724827222227,
+ "loss": 1.3676,
+ "step": 61
+ },
+ {
+ "epoch": 1.2448979591836735,
+ "grad_norm": 0.15999363362789154,
+ "learning_rate": 0.00016384645424699835,
+ "loss": 1.3651,
+ "step": 62
+ },
+ {
+ "epoch": 1.2653061224489797,
+ "grad_norm": 0.1705988198518753,
+ "learning_rate": 0.00016253744643216368,
+ "loss": 1.3757,
+ "step": 63
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 0.14996370673179626,
+ "learning_rate": 0.0001612105982547663,
+ "loss": 1.3474,
+ "step": 64
+ },
+ {
+ "epoch": 1.306122448979592,
+ "grad_norm": 0.19127260148525238,
+ "learning_rate": 0.0001598662882312615,
+ "loss": 1.3414,
+ "step": 65
+ },
+ {
+ "epoch": 1.306122448979592,
+ "eval_loss": 1.331880807876587,
+ "eval_runtime": 270.8424,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 65
+ },
+ {
+ "epoch": 1.3265306122448979,
+ "grad_norm": 0.16125527024269104,
+ "learning_rate": 0.00015850489985953076,
+ "loss": 1.3509,
+ "step": 66
+ },
+ {
+ "epoch": 1.346938775510204,
+ "grad_norm": 0.1979473978281021,
+ "learning_rate": 0.00015712682150947923,
+ "loss": 1.3579,
+ "step": 67
+ },
+ {
+ "epoch": 1.3673469387755102,
+ "grad_norm": 0.18317992985248566,
+ "learning_rate": 0.00015573244631224365,
+ "loss": 1.3341,
+ "step": 68
+ },
+ {
+ "epoch": 1.3877551020408163,
+ "grad_norm": 0.1646898239850998,
+ "learning_rate": 0.0001543221720480419,
+ "loss": 1.3361,
+ "step": 69
+ },
+ {
+ "epoch": 1.4081632653061225,
+ "grad_norm": 0.1760271042585373,
+ "learning_rate": 0.00015289640103269625,
+ "loss": 1.358,
+ "step": 70
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.165283203125,
+ "learning_rate": 0.0001514555400028629,
+ "loss": 1.3072,
+ "step": 71
+ },
+ {
+ "epoch": 1.4489795918367347,
+ "grad_norm": 0.1507076472043991,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 1.3133,
+ "step": 72
+ },
+ {
+ "epoch": 1.469387755102041,
+ "grad_norm": 0.16913647949695587,
+ "learning_rate": 0.00014853019625310813,
+ "loss": 1.3232,
+ "step": 73
+ },
+ {
+ "epoch": 1.489795918367347,
+ "grad_norm": 0.18266479671001434,
+ "learning_rate": 0.0001470465480602756,
+ "loss": 1.3512,
+ "step": 74
+ },
+ {
+ "epoch": 1.510204081632653,
+ "grad_norm": 0.19301828742027283,
+ "learning_rate": 0.0001455494786690634,
+ "loss": 1.3241,
+ "step": 75
+ },
+ {
+ "epoch": 1.5306122448979593,
+ "grad_norm": 0.16109652817249298,
+ "learning_rate": 0.00014403941515576344,
+ "loss": 1.3256,
+ "step": 76
+ },
+ {
+ "epoch": 1.5510204081632653,
+ "grad_norm": 0.17053867876529694,
+ "learning_rate": 0.00014251678830356408,
+ "loss": 1.3162,
+ "step": 77
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 0.17348544299602509,
+ "learning_rate": 0.00014098203247965875,
+ "loss": 1.3213,
+ "step": 78
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "eval_loss": 1.3028697967529297,
+ "eval_runtime": 270.8095,
+ "eval_samples_per_second": 6.259,
+ "eval_steps_per_second": 3.131,
+ "step": 78
+ },
+ {
+ "epoch": 1.5918367346938775,
+ "grad_norm": 0.1703907549381256,
+ "learning_rate": 0.00013943558551133186,
+ "loss": 1.3073,
+ "step": 79
+ },
+ {
+ "epoch": 1.6122448979591837,
+ "grad_norm": 0.17313100397586823,
+ "learning_rate": 0.0001378778885610576,
+ "loss": 1.3232,
+ "step": 80
+ },
+ {
+ "epoch": 1.6326530612244898,
+ "grad_norm": 0.17237025499343872,
+ "learning_rate": 0.00013630938600064747,
+ "loss": 1.3406,
+ "step": 81
+ },
+ {
+ "epoch": 1.6530612244897958,
+ "grad_norm": 0.19658459722995758,
+ "learning_rate": 0.00013473052528448201,
+ "loss": 1.3114,
+ "step": 82
+ },
+ {
+ "epoch": 1.6734693877551021,
+ "grad_norm": 0.20599938929080963,
+ "learning_rate": 0.0001331417568218636,
+ "loss": 1.3288,
+ "step": 83
+ },
+ {
+ "epoch": 1.693877551020408,
+ "grad_norm": 0.17759399116039276,
+ "learning_rate": 0.00013154353384852558,
+ "loss": 1.2995,
+ "step": 84
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 0.18712250888347626,
+ "learning_rate": 0.00012993631229733582,
+ "loss": 1.2895,
+ "step": 85
+ },
+ {
+ "epoch": 1.7346938775510203,
+ "grad_norm": 0.1991330236196518,
+ "learning_rate": 0.00012832055066823038,
+ "loss": 1.2886,
+ "step": 86
+ },
+ {
+ "epoch": 1.7551020408163265,
+ "grad_norm": 0.22125203907489777,
+ "learning_rate": 0.00012669670989741517,
+ "loss": 1.3233,
+ "step": 87
+ },
+ {
+ "epoch": 1.7755102040816326,
+ "grad_norm": 0.2052813619375229,
+ "learning_rate": 0.00012506525322587207,
+ "loss": 1.3079,
+ "step": 88
+ },
+ {
+ "epoch": 1.7959183673469388,
+ "grad_norm": 0.19290736317634583,
+ "learning_rate": 0.00012342664606720822,
+ "loss": 1.3174,
+ "step": 89
+ },
+ {
+ "epoch": 1.816326530612245,
+ "grad_norm": 0.20912542939186096,
+ "learning_rate": 0.00012178135587488515,
+ "loss": 1.2915,
+ "step": 90
+ },
+ {
+ "epoch": 1.836734693877551,
+ "grad_norm": 0.20760588347911835,
+ "learning_rate": 0.00012012985200886602,
+ "loss": 1.3028,
+ "step": 91
+ },
+ {
+ "epoch": 1.836734693877551,
+ "eval_loss": 1.2795333862304688,
+ "eval_runtime": 270.6525,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 91
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.1996900886297226,
+ "learning_rate": 0.00011847260560171896,
+ "loss": 1.3119,
+ "step": 92
+ },
+ {
+ "epoch": 1.8775510204081631,
+ "grad_norm": 0.23766876757144928,
+ "learning_rate": 0.00011681008942421483,
+ "loss": 1.2978,
+ "step": 93
+ },
+ {
+ "epoch": 1.8979591836734695,
+ "grad_norm": 0.19782397150993347,
+ "learning_rate": 0.00011514277775045768,
+ "loss": 1.2955,
+ "step": 94
+ },
+ {
+ "epoch": 1.9183673469387754,
+ "grad_norm": 0.22519494593143463,
+ "learning_rate": 0.00011347114622258612,
+ "loss": 1.2957,
+ "step": 95
+ },
+ {
+ "epoch": 1.9387755102040818,
+ "grad_norm": 0.2590245306491852,
+ "learning_rate": 0.00011179567171508463,
+ "loss": 1.2809,
+ "step": 96
+ },
+ {
+ "epoch": 1.9591836734693877,
+ "grad_norm": 0.2235420197248459,
+ "learning_rate": 0.00011011683219874323,
+ "loss": 1.2784,
+ "step": 97
+ },
+ {
+ "epoch": 1.9795918367346939,
+ "grad_norm": 0.285740464925766,
+ "learning_rate": 0.00010843510660430447,
+ "loss": 1.309,
+ "step": 98
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.20554350316524506,
+ "learning_rate": 0.00010675097468583652,
+ "loss": 1.273,
+ "step": 99
+ },
+ {
+ "epoch": 2.020408163265306,
+ "grad_norm": 0.24468418955802917,
+ "learning_rate": 0.00010506491688387127,
+ "loss": 1.2833,
+ "step": 100
+ },
+ {
+ "epoch": 2.020408163265306,
+ "grad_norm": 0.21553528308868408,
+ "learning_rate": 0.00010337741418834684,
+ "loss": 1.2669,
+ "step": 101
+ },
+ {
+ "epoch": 2.0408163265306123,
+ "grad_norm": 0.22015659511089325,
+ "learning_rate": 0.0001016889480013931,
+ "loss": 1.2795,
+ "step": 102
+ },
+ {
+ "epoch": 2.061224489795918,
+ "grad_norm": 0.2028799206018448,
+ "learning_rate": 0.0001,
+ "loss": 1.2584,
+ "step": 103
+ },
+ {
+ "epoch": 2.0816326530612246,
+ "grad_norm": 0.23474323749542236,
+ "learning_rate": 9.83110519986069e-05,
+ "loss": 1.2761,
+ "step": 104
+ },
+ {
+ "epoch": 2.0816326530612246,
+ "eval_loss": 1.2696796655654907,
+ "eval_runtime": 270.6586,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 104
+ },
+ {
+ "epoch": 2.1020408163265305,
+ "grad_norm": 0.21070216596126556,
+ "learning_rate": 9.662258581165319e-05,
+ "loss": 1.2808,
+ "step": 105
+ },
+ {
+ "epoch": 2.122448979591837,
+ "grad_norm": 0.21867221593856812,
+ "learning_rate": 9.493508311612874e-05,
+ "loss": 1.2873,
+ "step": 106
+ },
+ {
+ "epoch": 2.142857142857143,
+ "grad_norm": 0.21630822122097015,
+ "learning_rate": 9.324902531416349e-05,
+ "loss": 1.2527,
+ "step": 107
+ },
+ {
+ "epoch": 2.163265306122449,
+ "grad_norm": 0.2134082019329071,
+ "learning_rate": 9.156489339569554e-05,
+ "loss": 1.2755,
+ "step": 108
+ },
+ {
+ "epoch": 2.183673469387755,
+ "grad_norm": 0.22310714423656464,
+ "learning_rate": 8.98831678012568e-05,
+ "loss": 1.2512,
+ "step": 109
+ },
+ {
+ "epoch": 2.204081632653061,
+ "grad_norm": 0.2365124374628067,
+ "learning_rate": 8.820432828491542e-05,
+ "loss": 1.2725,
+ "step": 110
+ },
+ {
+ "epoch": 2.2244897959183674,
+ "grad_norm": 0.2086496651172638,
+ "learning_rate": 8.652885377741393e-05,
+ "loss": 1.2488,
+ "step": 111
+ },
+ {
+ "epoch": 2.2448979591836733,
+ "grad_norm": 0.20848101377487183,
+ "learning_rate": 8.485722224954237e-05,
+ "loss": 1.2793,
+ "step": 112
+ },
+ {
+ "epoch": 2.2653061224489797,
+ "grad_norm": 0.20784686505794525,
+ "learning_rate": 8.31899105757852e-05,
+ "loss": 1.2564,
+ "step": 113
+ },
+ {
+ "epoch": 2.2857142857142856,
+ "grad_norm": 0.21896174550056458,
+ "learning_rate": 8.15273943982811e-05,
+ "loss": 1.2515,
+ "step": 114
+ },
+ {
+ "epoch": 2.306122448979592,
+ "grad_norm": 0.21367855370044708,
+ "learning_rate": 7.987014799113397e-05,
+ "loss": 1.248,
+ "step": 115
+ },
+ {
+ "epoch": 2.326530612244898,
+ "grad_norm": 0.20891636610031128,
+ "learning_rate": 7.821864412511485e-05,
+ "loss": 1.2753,
+ "step": 116
+ },
+ {
+ "epoch": 2.3469387755102042,
+ "grad_norm": 0.2092975378036499,
+ "learning_rate": 7.65733539327918e-05,
+ "loss": 1.2509,
+ "step": 117
+ },
+ {
+ "epoch": 2.3469387755102042,
+ "eval_loss": 1.258699655532837,
+ "eval_runtime": 270.5384,
+ "eval_samples_per_second": 6.265,
+ "eval_steps_per_second": 3.134,
+ "step": 117
+ },
+ {
+ "epoch": 2.36734693877551,
+ "grad_norm": 0.1905972808599472,
+ "learning_rate": 7.493474677412794e-05,
+ "loss": 1.2516,
+ "step": 118
+ },
+ {
+ "epoch": 2.387755102040816,
+ "grad_norm": 0.19716158509254456,
+ "learning_rate": 7.330329010258483e-05,
+ "loss": 1.2665,
+ "step": 119
+ },
+ {
+ "epoch": 2.4081632653061225,
+ "grad_norm": 0.1953389048576355,
+ "learning_rate": 7.16794493317696e-05,
+ "loss": 1.2661,
+ "step": 120
+ },
+ {
+ "epoch": 2.4285714285714284,
+ "grad_norm": 0.1990067958831787,
+ "learning_rate": 7.006368770266421e-05,
+ "loss": 1.2619,
+ "step": 121
+ },
+ {
+ "epoch": 2.4489795918367347,
+ "grad_norm": 0.1954919546842575,
+ "learning_rate": 6.845646615147445e-05,
+ "loss": 1.2736,
+ "step": 122
+ },
+ {
+ "epoch": 2.4693877551020407,
+ "grad_norm": 0.18382853269577026,
+ "learning_rate": 6.685824317813643e-05,
+ "loss": 1.2732,
+ "step": 123
+ },
+ {
+ "epoch": 2.489795918367347,
+ "grad_norm": 0.18729491531848907,
+ "learning_rate": 6.526947471551798e-05,
+ "loss": 1.2509,
+ "step": 124
+ },
+ {
+ "epoch": 2.510204081632653,
+ "grad_norm": 0.2034740000963211,
+ "learning_rate": 6.369061399935255e-05,
+ "loss": 1.2829,
+ "step": 125
+ },
+ {
+ "epoch": 2.5306122448979593,
+ "grad_norm": 0.1952620893716812,
+ "learning_rate": 6.21221114389424e-05,
+ "loss": 1.2689,
+ "step": 126
+ },
+ {
+ "epoch": 2.5510204081632653,
+ "grad_norm": 0.1986168622970581,
+ "learning_rate": 6.0564414488668165e-05,
+ "loss": 1.2644,
+ "step": 127
+ },
+ {
+ "epoch": 2.571428571428571,
+ "grad_norm": 0.19526751339435577,
+ "learning_rate": 5.901796752034128e-05,
+ "loss": 1.265,
+ "step": 128
+ },
+ {
+ "epoch": 2.5918367346938775,
+ "grad_norm": 0.195367693901062,
+ "learning_rate": 5.748321169643596e-05,
+ "loss": 1.2782,
+ "step": 129
+ },
+ {
+ "epoch": 2.612244897959184,
+ "grad_norm": 0.18351928889751434,
+ "learning_rate": 5.596058484423656e-05,
+ "loss": 1.2884,
+ "step": 130
+ },
+ {
+ "epoch": 2.612244897959184,
+ "eval_loss": 1.2471545934677124,
+ "eval_runtime": 270.4953,
+ "eval_samples_per_second": 6.266,
+ "eval_steps_per_second": 3.135,
+ "step": 130
+ },
+ {
+ "epoch": 2.63265306122449,
+ "grad_norm": 0.2015760987997055,
+ "learning_rate": 5.44505213309366e-05,
+ "loss": 1.2536,
+ "step": 131
+ },
+ {
+ "epoch": 2.6530612244897958,
+ "grad_norm": 0.1734190732240677,
+ "learning_rate": 5.2953451939724454e-05,
+ "loss": 1.2628,
+ "step": 132
+ },
+ {
+ "epoch": 2.673469387755102,
+ "grad_norm": 0.214066281914711,
+ "learning_rate": 5.146980374689192e-05,
+ "loss": 1.2543,
+ "step": 133
+ },
+ {
+ "epoch": 2.693877551020408,
+ "grad_norm": 0.17507924139499664,
+ "learning_rate": 5.000000000000002e-05,
+ "loss": 1.2665,
+ "step": 134
+ },
+ {
+ "epoch": 2.7142857142857144,
+ "grad_norm": 0.1778109222650528,
+ "learning_rate": 4.854445999713715e-05,
+ "loss": 1.2789,
+ "step": 135
+ },
+ {
+ "epoch": 2.7346938775510203,
+ "grad_norm": 0.1856827288866043,
+ "learning_rate": 4.710359896730379e-05,
+ "loss": 1.2481,
+ "step": 136
+ },
+ {
+ "epoch": 2.7551020408163263,
+ "grad_norm": 0.17856694757938385,
+ "learning_rate": 4.567782795195816e-05,
+ "loss": 1.2732,
+ "step": 137
+ },
+ {
+ "epoch": 2.7755102040816326,
+ "grad_norm": 0.21598489582538605,
+ "learning_rate": 4.426755368775637e-05,
+ "loss": 1.2525,
+ "step": 138
+ },
+ {
+ "epoch": 2.795918367346939,
+ "grad_norm": 0.17308436334133148,
+ "learning_rate": 4.287317849052075e-05,
+ "loss": 1.2665,
+ "step": 139
+ },
+ {
+ "epoch": 2.816326530612245,
+ "grad_norm": 0.19207212328910828,
+ "learning_rate": 4.149510014046922e-05,
+ "loss": 1.2681,
+ "step": 140
+ },
+ {
+ "epoch": 2.836734693877551,
+ "grad_norm": 0.19626958668231964,
+ "learning_rate": 4.013371176873849e-05,
+ "loss": 1.2727,
+ "step": 141
+ },
+ {
+ "epoch": 2.857142857142857,
+ "grad_norm": 0.1986483484506607,
+ "learning_rate": 3.878940174523371e-05,
+ "loss": 1.2414,
+ "step": 142
+ },
+ {
+ "epoch": 2.877551020408163,
+ "grad_norm": 0.19369089603424072,
+ "learning_rate": 3.746255356783632e-05,
+ "loss": 1.254,
+ "step": 143
+ },
+ {
+ "epoch": 2.877551020408163,
+ "eval_loss": 1.2410293817520142,
+ "eval_runtime": 270.6762,
+ "eval_samples_per_second": 6.262,
+ "eval_steps_per_second": 3.133,
+ "step": 143
+ },
+ {
+ "epoch": 2.8979591836734695,
+ "grad_norm": 0.20910531282424927,
+ "learning_rate": 3.615354575300166e-05,
+ "loss": 1.2541,
+ "step": 144
+ },
+ {
+ "epoch": 2.9183673469387754,
+ "grad_norm": 0.19536806643009186,
+ "learning_rate": 3.4862751727777797e-05,
+ "loss": 1.2517,
+ "step": 145
+ },
+ {
+ "epoch": 2.938775510204082,
+ "grad_norm": 0.18630966544151306,
+ "learning_rate": 3.3590539723276083e-05,
+ "loss": 1.2473,
+ "step": 146
+ },
+ {
+ "epoch": 2.9591836734693877,
+ "grad_norm": 0.1874723732471466,
+ "learning_rate": 3.233727266962425e-05,
+ "loss": 1.244,
+ "step": 147
+ },
+ {
+ "epoch": 2.979591836734694,
+ "grad_norm": 0.1764463186264038,
+ "learning_rate": 3.110330809243134e-05,
+ "loss": 1.2465,
+ "step": 148
+ },
+ {
+ "epoch": 3.0,
+ "grad_norm": 0.16570010781288147,
+ "learning_rate": 2.9888998010794743e-05,
+ "loss": 1.2443,
+ "step": 149
+ },
+ {
+ "epoch": 3.020408163265306,
+ "grad_norm": 0.18820856511592865,
+ "learning_rate": 2.869468883687798e-05,
+ "loss": 1.2694,
+ "step": 150
+ },
+ {
+ "epoch": 3.020408163265306,
+ "grad_norm": 0.2009415626525879,
+ "learning_rate": 2.7520721277088024e-05,
+ "loss": 1.2185,
+ "step": 151
+ },
+ {
+ "epoch": 3.0408163265306123,
+ "grad_norm": 0.1824546605348587,
+ "learning_rate": 2.6367430234880284e-05,
+ "loss": 1.2222,
+ "step": 152
+ },
+ {
+ "epoch": 3.061224489795918,
+ "grad_norm": 0.180531844496727,
+ "learning_rate": 2.523514471521913e-05,
+ "loss": 1.2592,
+ "step": 153
+ },
+ {
+ "epoch": 3.0816326530612246,
+ "grad_norm": 0.17422904074192047,
+ "learning_rate": 2.4124187730720917e-05,
+ "loss": 1.2429,
+ "step": 154
+ },
+ {
+ "epoch": 3.1020408163265305,
+ "grad_norm": 0.17531636357307434,
+ "learning_rate": 2.3034876209506772e-05,
+ "loss": 1.2459,
+ "step": 155
+ },
+ {
+ "epoch": 3.122448979591837,
+ "grad_norm": 0.17256909608840942,
+ "learning_rate": 2.1967520904790827e-05,
+ "loss": 1.2523,
+ "step": 156
+ },
+ {
+ "epoch": 3.122448979591837,
+ "eval_loss": 1.240277886390686,
+ "eval_runtime": 270.7279,
+ "eval_samples_per_second": 6.261,
+ "eval_steps_per_second": 3.132,
+ "step": 156
+ },
+ {
+ "epoch": 3.142857142857143,
+ "grad_norm": 0.17711801826953888,
+ "learning_rate": 2.092242630623016e-05,
+ "loss": 1.2416,
+ "step": 157
+ },
+ {
+ "epoch": 3.163265306122449,
+ "grad_norm": 0.1642543524503708,
+ "learning_rate": 1.9899890553061562e-05,
+ "loss": 1.2563,
+ "step": 158
+ },
+ {
+ "epoch": 3.183673469387755,
+ "grad_norm": 0.17609795928001404,
+ "learning_rate": 1.8900205349049904e-05,
+ "loss": 1.2406,
+ "step": 159
+ },
+ {
+ "epoch": 3.204081632653061,
+ "grad_norm": 0.18534283339977264,
+ "learning_rate": 1.7923655879272393e-05,
+ "loss": 1.2522,
+ "step": 160
+ },
+ {
+ "epoch": 3.2244897959183674,
+ "grad_norm": 0.17926208674907684,
+ "learning_rate": 1.6970520728762375e-05,
+ "loss": 1.2315,
+ "step": 161
+ },
+ {
+ "epoch": 3.2448979591836733,
+ "grad_norm": 0.18245543539524078,
+ "learning_rate": 1.60410718030361e-05,
+ "loss": 1.2493,
+ "step": 162
+ },
+ {
+ "epoch": 3.2653061224489797,
+ "grad_norm": 0.16576482355594635,
+ "learning_rate": 1.5135574250524897e-05,
+ "loss": 1.2633,
+ "step": 163
+ },
+ {
+ "epoch": 3.2857142857142856,
+ "grad_norm": 0.1768399477005005,
+ "learning_rate": 1.425428638693489e-05,
+ "loss": 1.2399,
+ "step": 164
+ },
+ {
+ "epoch": 3.306122448979592,
+ "grad_norm": 0.17402540147304535,
+ "learning_rate": 1.339745962155613e-05,
+ "loss": 1.2574,
+ "step": 165
+ },
+ {
+ "epoch": 3.326530612244898,
+ "grad_norm": 0.17550399899482727,
+ "learning_rate": 1.2565338385541792e-05,
+ "loss": 1.2429,
+ "step": 166
+ },
+ {
+ "epoch": 3.3469387755102042,
+ "grad_norm": 0.18776686489582062,
+ "learning_rate": 1.1758160062178093e-05,
+ "loss": 1.2378,
+ "step": 167
+ },
+ {
+ "epoch": 3.36734693877551,
+ "grad_norm": 0.1816324144601822,
+ "learning_rate": 1.097615491916485e-05,
+ "loss": 1.2503,
+ "step": 168
+ },
+ {
+ "epoch": 3.387755102040816,
+ "grad_norm": 0.17802877724170685,
+ "learning_rate": 1.0219546042925843e-05,
+ "loss": 1.2468,
+ "step": 169
+ },
+ {
+ "epoch": 3.387755102040816,
+ "eval_loss": 1.2385426759719849,
+ "eval_runtime": 270.6389,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 169
+ },
+ {
+ "epoch": 3.4081632653061225,
+ "grad_norm": 0.1731177568435669,
+ "learning_rate": 9.488549274967872e-06,
+ "loss": 1.2431,
+ "step": 170
+ },
+ {
+ "epoch": 3.4285714285714284,
+ "grad_norm": 0.16203820705413818,
+ "learning_rate": 8.783373150306661e-06,
+ "loss": 1.2394,
+ "step": 171
+ },
+ {
+ "epoch": 3.4489795918367347,
+ "grad_norm": 0.1603914201259613,
+ "learning_rate": 8.10421883797694e-06,
+ "loss": 1.2317,
+ "step": 172
+ },
+ {
+ "epoch": 3.4693877551020407,
+ "grad_norm": 0.16672447323799133,
+ "learning_rate": 7.4512800836440525e-06,
+ "loss": 1.2382,
+ "step": 173
+ },
+ {
+ "epoch": 3.489795918367347,
+ "grad_norm": 0.16903318464756012,
+ "learning_rate": 6.824743154333157e-06,
+ "loss": 1.2406,
+ "step": 174
+ },
+ {
+ "epoch": 3.510204081632653,
+ "grad_norm": 0.16718582808971405,
+ "learning_rate": 6.22478678529197e-06,
+ "loss": 1.2253,
+ "step": 175
+ },
+ {
+ "epoch": 3.5306122448979593,
+ "grad_norm": 0.16773243248462677,
+ "learning_rate": 5.651582129001986e-06,
+ "loss": 1.2545,
+ "step": 176
+ },
+ {
+ "epoch": 3.5510204081632653,
+ "grad_norm": 0.16658060252666473,
+ "learning_rate": 5.105292706353093e-06,
+ "loss": 1.2329,
+ "step": 177
+ },
+ {
+ "epoch": 3.571428571428571,
+ "grad_norm": 0.16760899126529694,
+ "learning_rate": 4.586074359995119e-06,
+ "loss": 1.2218,
+ "step": 178
+ },
+ {
+ "epoch": 3.5918367346938775,
+ "grad_norm": 0.17462213337421417,
+ "learning_rate": 4.094075209879788e-06,
+ "loss": 1.236,
+ "step": 179
+ },
+ {
+ "epoch": 3.612244897959184,
+ "grad_norm": 0.16253593564033508,
+ "learning_rate": 3.6294356110059157e-06,
+ "loss": 1.2518,
+ "step": 180
+ },
+ {
+ "epoch": 3.63265306122449,
+ "grad_norm": 0.16653120517730713,
+ "learning_rate": 3.1922881133795825e-06,
+ "loss": 1.2171,
+ "step": 181
+ },
+ {
+ "epoch": 3.6530612244897958,
+ "grad_norm": 0.1757594645023346,
+ "learning_rate": 2.7827574242009437e-06,
+ "loss": 1.2476,
+ "step": 182
+ },
+ {
+ "epoch": 3.6530612244897958,
+ "eval_loss": 1.237037181854248,
+ "eval_runtime": 270.3815,
+ "eval_samples_per_second": 6.269,
+ "eval_steps_per_second": 3.136,
+ "step": 182
+ },
+ {
+ "epoch": 3.673469387755102,
+ "grad_norm": 0.1665186882019043,
+ "learning_rate": 2.4009603722884742e-06,
+ "loss": 1.2497,
+ "step": 183
+ },
+ {
+ "epoch": 3.693877551020408,
+ "grad_norm": 0.17469817399978638,
+ "learning_rate": 2.0470058747505516e-06,
+ "loss": 1.2426,
+ "step": 184
+ },
+ {
+ "epoch": 3.7142857142857144,
+ "grad_norm": 0.17130160331726074,
+ "learning_rate": 1.7209949059142083e-06,
+ "loss": 1.2255,
+ "step": 185
+ },
+ {
+ "epoch": 3.7346938775510203,
+ "grad_norm": 0.1677573323249817,
+ "learning_rate": 1.4230204685196203e-06,
+ "loss": 1.2643,
+ "step": 186
+ },
+ {
+ "epoch": 3.7551020408163263,
+ "grad_norm": 0.16778886318206787,
+ "learning_rate": 1.1531675671888619e-06,
+ "loss": 1.234,
+ "step": 187
+ },
+ {
+ "epoch": 3.7755102040816326,
+ "grad_norm": 0.16397559642791748,
+ "learning_rate": 9.11513184176116e-07,
+ "loss": 1.2509,
+ "step": 188
+ },
+ {
+ "epoch": 3.795918367346939,
+ "grad_norm": 0.16539420187473297,
+ "learning_rate": 6.981262574066394e-07,
+ "loss": 1.2425,
+ "step": 189
+ },
+ {
+ "epoch": 3.816326530612245,
+ "grad_norm": 0.18255014717578888,
+ "learning_rate": 5.130676608104845e-07,
+ "loss": 1.2628,
+ "step": 190
+ },
+ {
+ "epoch": 3.836734693877551,
+ "grad_norm": 0.16024163365364075,
+ "learning_rate": 3.56390186956701e-07,
+ "loss": 1.2331,
+ "step": 191
+ },
+ {
+ "epoch": 3.857142857142857,
+ "grad_norm": 0.17575234174728394,
+ "learning_rate": 2.2813853199292746e-07,
+ "loss": 1.2497,
+ "step": 192
+ },
+ {
+ "epoch": 3.877551020408163,
+ "grad_norm": 0.1590609848499298,
+ "learning_rate": 1.2834928289472416e-07,
+ "loss": 1.2436,
+ "step": 193
+ },
+ {
+ "epoch": 3.8979591836734695,
+ "grad_norm": 0.17772971093654633,
+ "learning_rate": 5.705090702819993e-08,
+ "loss": 1.2361,
+ "step": 194
+ },
+ {
+ "epoch": 3.9183673469387754,
+ "grad_norm": 0.15970654785633087,
+ "learning_rate": 1.426374402901942e-08,
+ "loss": 1.2366,
+ "step": 195
+ },
+ {
+ "epoch": 3.9183673469387754,
+ "eval_loss": 1.2375136613845825,
+ "eval_runtime": 270.7418,
+ "eval_samples_per_second": 6.261,
+ "eval_steps_per_second": 3.132,
+ "step": 195
+ },
+ {
+ "epoch": 3.938775510204082,
+ "grad_norm": 0.15187527239322662,
+ "learning_rate": 0.0,
+ "loss": 1.2409,
+ "step": 196
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 196,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 49,
+ "total_flos": 4.083740321198899e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-196/training_args.bin b/checkpoint-196/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61
--- /dev/null
+++ b/checkpoint-196/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816
diff --git a/checkpoint-49/README.md b/checkpoint-49/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7
--- /dev/null
+++ b/checkpoint-49/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
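+
+A minimal sketch of loading this checkpoint's LoRA adapter on top of the base model (assuming `transformers` and `peft` are installed; the local adapter path and the example prompt are illustrative, not documented usage from this repository):
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+
+# Load the base model and tokenizer, then attach this checkpoint's LoRA adapter.
+tokenizer = AutoTokenizer.from_pretrained(base_id)
+model = AutoModelForCausalLM.from_pretrained(base_id)
+model = PeftModel.from_pretrained(model, "./checkpoint-49")  # illustrative local path
+
+# The saved tokenizer config carries a chatml chat template, so build the prompt with it.
+messages = [{"role": "user", "content": "dark teal"}]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+inputs = tokenizer(prompt, return_tensors="pt")
+outputs = model.generate(**inputs, max_new_tokens=64)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+```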
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.10.0
\ No newline at end of file
diff --git a/checkpoint-49/adapter_config.json b/checkpoint-49/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2
--- /dev/null
+++ b/checkpoint-49/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-49/adapter_model.safetensors b/checkpoint-49/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2be6674fb682f39b548ed9e9bd5da5609f3dc247
--- /dev/null
+++ b/checkpoint-49/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15a7afbbb6db02fdac7ffe868d42729e1c9515f835763d3b9551db4ae31e3529
+size 100966336
diff --git a/checkpoint-49/optimizer.pt b/checkpoint-49/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..25c358c12ab69ae68df830c4eac3f66906a29560
--- /dev/null
+++ b/checkpoint-49/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9db4145fa287fcc2dc98bac341ab537efce6a4407796361cd24ac6b2176f6a70
+size 50916644
diff --git a/checkpoint-49/rng_state.pth b/checkpoint-49/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..da1cdd4a55a9f91350d2cac6a5db9d6937576d0a
--- /dev/null
+++ b/checkpoint-49/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41e595b32f221472ac195c50986dfcd13bac01a4909d487f497aaa38e078d0c2
+size 14244
diff --git a/checkpoint-49/scheduler.pt b/checkpoint-49/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..17d9c23995b4c00077c7f6144a172ccd082a6603
--- /dev/null
+++ b/checkpoint-49/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5af14094f757ccb041613325b6c93fe808050ec47f3a4ec285ab4a0e229950
+size 1064
diff --git a/checkpoint-49/special_tokens_map.json b/checkpoint-49/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-49/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-49/tokenizer.model b/checkpoint-49/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-49/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-49/tokenizer_config.json b/checkpoint-49/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837
--- /dev/null
+++ b/checkpoint-49/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-49/trainer_state.json b/checkpoint-49/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..b2df254a5cae83ed593542b19908da9e31dbd7c7
--- /dev/null
+++ b/checkpoint-49/trainer_state.json
@@ -0,0 +1,396 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.0,
+ "eval_steps": 13,
+ "global_step": 49,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02040816326530612,
+ "grad_norm": 0.7881951332092285,
+ "learning_rate": 2e-05,
+ "loss": 2.7509,
+ "step": 1
+ },
+ {
+ "epoch": 0.02040816326530612,
+ "eval_loss": 2.6902382373809814,
+ "eval_runtime": 269.5606,
+ "eval_samples_per_second": 6.288,
+ "eval_steps_per_second": 3.146,
+ "step": 1
+ },
+ {
+ "epoch": 0.04081632653061224,
+ "grad_norm": 0.789082407951355,
+ "learning_rate": 4e-05,
+ "loss": 2.7449,
+ "step": 2
+ },
+ {
+ "epoch": 0.061224489795918366,
+ "grad_norm": 0.7354114055633545,
+ "learning_rate": 6e-05,
+ "loss": 2.7164,
+ "step": 3
+ },
+ {
+ "epoch": 0.08163265306122448,
+ "grad_norm": 0.7292255759239197,
+ "learning_rate": 8e-05,
+ "loss": 2.7174,
+ "step": 4
+ },
+ {
+ "epoch": 0.10204081632653061,
+ "grad_norm": 0.6898028254508972,
+ "learning_rate": 0.0001,
+ "loss": 2.6891,
+ "step": 5
+ },
+ {
+ "epoch": 0.12244897959183673,
+ "grad_norm": 0.6861400604248047,
+ "learning_rate": 0.00012,
+ "loss": 2.6545,
+ "step": 6
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 0.7510350346565247,
+ "learning_rate": 0.00014,
+ "loss": 2.5656,
+ "step": 7
+ },
+ {
+ "epoch": 0.16326530612244897,
+ "grad_norm": 0.8011165261268616,
+ "learning_rate": 0.00016,
+ "loss": 2.4519,
+ "step": 8
+ },
+ {
+ "epoch": 0.1836734693877551,
+ "grad_norm": 0.8624005317687988,
+ "learning_rate": 0.00018,
+ "loss": 2.3178,
+ "step": 9
+ },
+ {
+ "epoch": 0.20408163265306123,
+ "grad_norm": 0.8004987835884094,
+ "learning_rate": 0.0002,
+ "loss": 2.1783,
+ "step": 10
+ },
+ {
+ "epoch": 0.22448979591836735,
+ "grad_norm": 0.6362400054931641,
+ "learning_rate": 0.000199985736255971,
+ "loss": 2.0252,
+ "step": 11
+ },
+ {
+ "epoch": 0.24489795918367346,
+ "grad_norm": 0.7930936217308044,
+ "learning_rate": 0.0001999429490929718,
+ "loss": 1.8839,
+ "step": 12
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "grad_norm": 0.5149843096733093,
+ "learning_rate": 0.00019987165071710527,
+ "loss": 1.8064,
+ "step": 13
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "eval_loss": 1.6734941005706787,
+ "eval_runtime": 271.2615,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 13
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.42121434211730957,
+ "learning_rate": 0.00019977186146800707,
+ "loss": 1.7922,
+ "step": 14
+ },
+ {
+ "epoch": 0.30612244897959184,
+ "grad_norm": 0.3523242771625519,
+ "learning_rate": 0.0001996436098130433,
+ "loss": 1.7711,
+ "step": 15
+ },
+ {
+ "epoch": 0.32653061224489793,
+ "grad_norm": 0.3384595215320587,
+ "learning_rate": 0.00019948693233918952,
+ "loss": 1.7152,
+ "step": 16
+ },
+ {
+ "epoch": 0.3469387755102041,
+ "grad_norm": 0.34942421317100525,
+ "learning_rate": 0.00019930187374259337,
+ "loss": 1.7112,
+ "step": 17
+ },
+ {
+ "epoch": 0.3673469387755102,
+ "grad_norm": 0.31712639331817627,
+ "learning_rate": 0.00019908848681582391,
+ "loss": 1.7059,
+ "step": 18
+ },
+ {
+ "epoch": 0.3877551020408163,
+ "grad_norm": 0.2875436842441559,
+ "learning_rate": 0.00019884683243281116,
+ "loss": 1.6468,
+ "step": 19
+ },
+ {
+ "epoch": 0.40816326530612246,
+ "grad_norm": 0.24433130025863647,
+ "learning_rate": 0.00019857697953148037,
+ "loss": 1.6408,
+ "step": 20
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 0.21414674818515778,
+ "learning_rate": 0.00019827900509408581,
+ "loss": 1.616,
+ "step": 21
+ },
+ {
+ "epoch": 0.4489795918367347,
+ "grad_norm": 0.21537622809410095,
+ "learning_rate": 0.00019795299412524945,
+ "loss": 1.609,
+ "step": 22
+ },
+ {
+ "epoch": 0.46938775510204084,
+ "grad_norm": 0.2432074397802353,
+ "learning_rate": 0.00019759903962771156,
+ "loss": 1.6066,
+ "step": 23
+ },
+ {
+ "epoch": 0.4897959183673469,
+ "grad_norm": 0.2359839379787445,
+ "learning_rate": 0.00019721724257579907,
+ "loss": 1.5851,
+ "step": 24
+ },
+ {
+ "epoch": 0.5102040816326531,
+ "grad_norm": 0.22065888345241547,
+ "learning_rate": 0.00019680771188662044,
+ "loss": 1.5739,
+ "step": 25
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "grad_norm": 0.20339132845401764,
+ "learning_rate": 0.0001963705643889941,
+ "loss": 1.5513,
+ "step": 26
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "eval_loss": 1.4832030534744263,
+ "eval_runtime": 271.2449,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 26
+ },
+ {
+ "epoch": 0.5510204081632653,
+ "grad_norm": 0.18875224888324738,
+ "learning_rate": 0.00019590592479012023,
+ "loss": 1.5378,
+ "step": 27
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.18564417958259583,
+ "learning_rate": 0.00019541392564000488,
+ "loss": 1.5212,
+ "step": 28
+ },
+ {
+ "epoch": 0.5918367346938775,
+ "grad_norm": 0.16226942837238312,
+ "learning_rate": 0.00019489470729364692,
+ "loss": 1.5391,
+ "step": 29
+ },
+ {
+ "epoch": 0.6122448979591837,
+ "grad_norm": 0.15650039911270142,
+ "learning_rate": 0.00019434841787099803,
+ "loss": 1.511,
+ "step": 30
+ },
+ {
+ "epoch": 0.6326530612244898,
+ "grad_norm": 0.15976540744304657,
+ "learning_rate": 0.00019377521321470805,
+ "loss": 1.5119,
+ "step": 31
+ },
+ {
+ "epoch": 0.6530612244897959,
+ "grad_norm": 0.16409288346767426,
+ "learning_rate": 0.00019317525684566685,
+ "loss": 1.4909,
+ "step": 32
+ },
+ {
+ "epoch": 0.673469387755102,
+ "grad_norm": 0.15468019247055054,
+ "learning_rate": 0.00019254871991635598,
+ "loss": 1.4951,
+ "step": 33
+ },
+ {
+ "epoch": 0.6938775510204082,
+ "grad_norm": 0.1462036371231079,
+ "learning_rate": 0.00019189578116202307,
+ "loss": 1.4643,
+ "step": 34
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 0.1541963368654251,
+ "learning_rate": 0.00019121662684969335,
+ "loss": 1.5159,
+ "step": 35
+ },
+ {
+ "epoch": 0.7346938775510204,
+ "grad_norm": 0.14798064529895782,
+ "learning_rate": 0.00019051145072503215,
+ "loss": 1.4741,
+ "step": 36
+ },
+ {
+ "epoch": 0.7551020408163265,
+ "grad_norm": 0.13914817571640015,
+ "learning_rate": 0.00018978045395707418,
+ "loss": 1.4788,
+ "step": 37
+ },
+ {
+ "epoch": 0.7755102040816326,
+ "grad_norm": 0.15608824789524078,
+ "learning_rate": 0.00018902384508083517,
+ "loss": 1.4687,
+ "step": 38
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "grad_norm": 0.14460116624832153,
+ "learning_rate": 0.00018824183993782192,
+ "loss": 1.482,
+ "step": 39
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "eval_loss": 1.411073088645935,
+ "eval_runtime": 271.292,
+ "eval_samples_per_second": 6.248,
+ "eval_steps_per_second": 3.126,
+ "step": 39
+ },
+ {
+ "epoch": 0.8163265306122449,
+ "grad_norm": 0.15740551054477692,
+ "learning_rate": 0.00018743466161445823,
+ "loss": 1.4486,
+ "step": 40
+ },
+ {
+ "epoch": 0.8367346938775511,
+ "grad_norm": 0.14149661362171173,
+ "learning_rate": 0.00018660254037844388,
+ "loss": 1.4353,
+ "step": 41
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.14034292101860046,
+ "learning_rate": 0.0001857457136130651,
+ "loss": 1.4523,
+ "step": 42
+ },
+ {
+ "epoch": 0.8775510204081632,
+ "grad_norm": 0.1487722396850586,
+ "learning_rate": 0.00018486442574947511,
+ "loss": 1.4095,
+ "step": 43
+ },
+ {
+ "epoch": 0.8979591836734694,
+ "grad_norm": 0.17400234937667847,
+ "learning_rate": 0.00018395892819696389,
+ "loss": 1.4414,
+ "step": 44
+ },
+ {
+ "epoch": 0.9183673469387755,
+ "grad_norm": 0.1741325408220291,
+ "learning_rate": 0.00018302947927123766,
+ "loss": 1.4379,
+ "step": 45
+ },
+ {
+ "epoch": 0.9387755102040817,
+ "grad_norm": 0.15319454669952393,
+ "learning_rate": 0.00018207634412072764,
+ "loss": 1.405,
+ "step": 46
+ },
+ {
+ "epoch": 0.9591836734693877,
+ "grad_norm": 0.15876264870166779,
+ "learning_rate": 0.00018109979465095013,
+ "loss": 1.4122,
+ "step": 47
+ },
+ {
+ "epoch": 0.9795918367346939,
+ "grad_norm": 0.17120805382728577,
+ "learning_rate": 0.00018010010944693848,
+ "loss": 1.4132,
+ "step": 48
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.1436116099357605,
+ "learning_rate": 0.00017907757369376985,
+ "loss": 1.416,
+ "step": 49
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 196,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 49,
+ "total_flos": 1.0209350802997248e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
diff --git a/checkpoint-49/training_args.bin b/checkpoint-49/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61
--- /dev/null
+++ b/checkpoint-49/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816
diff --git a/checkpoint-98/README.md b/checkpoint-98/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..78aadd3258d05c05cabc678473b6f0942cd441c7
--- /dev/null
+++ b/checkpoint-98/README.md
@@ -0,0 +1,202 @@
+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+
+# Model Card for Model ID
+
+
+
+
+
+## Model Details
+
+### Model Description
+
+
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+
+
+### Direct Use
+
+
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+
+
+[More Information Needed]
+
+### Recommendations
+
+
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
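+
+A minimal sketch (assuming `peft` is installed; the output directory name is illustrative) of merging this checkpoint's LoRA adapter into the base weights so the result can be used without `peft` at inference time:
+
+```python
+from transformers import AutoModelForCausalLM
+from peft import PeftModel
+
+# Attach the adapter from this checkpoint, then fold its low-rank deltas into
+# the base weights and save a standalone model directory.
+base = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")
+merged = PeftModel.from_pretrained(base, "./checkpoint-98").merge_and_unload()
+merged.save_pretrained("./tinyllama-colors-merged")  # hypothetical output directory
+```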
+
+## Training Details
+
+### Training Data
+
+
+
+[More Information Needed]
+
+### Training Procedure
+
+
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed]
+
+#### Speeds, Sizes, Times [optional]
+
+
+
+[More Information Needed]
+
+## Evaluation
+
+
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+
+
+[More Information Needed]
+
+#### Factors
+
+
+
+[More Information Needed]
+
+#### Metrics
+
+
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+
+
+[More Information Needed]
+
+## Environmental Impact
+
+
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.10.0
\ No newline at end of file
diff --git a/checkpoint-98/adapter_config.json b/checkpoint-98/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3a88eecac3a652d9c19fe6d8e555dcac4859ccd2
--- /dev/null
+++ b/checkpoint-98/adapter_config.json
@@ -0,0 +1,34 @@
+{
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "bias": "none",
+ "fan_in_fan_out": null,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "up_proj",
+ "o_proj",
+ "down_proj",
+ "k_proj",
+ "q_proj",
+ "gate_proj",
+ "v_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+}
\ No newline at end of file
diff --git a/checkpoint-98/adapter_model.safetensors b/checkpoint-98/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c25b0063b69349b0cb6176f283c217cdd9246694
--- /dev/null
+++ b/checkpoint-98/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88f74a76e06a6e5698ca16a682f4fa5d7e5c10182d165fe6c9327116444b10d0
+size 100966336
diff --git a/checkpoint-98/optimizer.pt b/checkpoint-98/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..eec9febc5f3485dcb35178d817f3969b8c7725b8
--- /dev/null
+++ b/checkpoint-98/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c01f653a0ce9ea304a86d075b21cd51ea729659b91629c555eec65181dd1818
+size 50916644
diff --git a/checkpoint-98/rng_state.pth b/checkpoint-98/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4f8bffc16c04a6d97a2f70d9897cc2b0f5b5ec32
--- /dev/null
+++ b/checkpoint-98/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff339d3bf5bb702320fd9a759e0988b159a701364f186575c95d51b72519d7a1
+size 14244
diff --git a/checkpoint-98/scheduler.pt b/checkpoint-98/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b09734329c851152a05af267cd70e47082a0b481
--- /dev/null
+++ b/checkpoint-98/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e31465eabc96d2c0b0dc68386782c8ea3a5771edcba13d0d620c4297cd31957
+size 1064
diff --git a/checkpoint-98/special_tokens_map.json b/checkpoint-98/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/checkpoint-98/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/checkpoint-98/tokenizer.model b/checkpoint-98/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/checkpoint-98/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/checkpoint-98/tokenizer_config.json b/checkpoint-98/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837
--- /dev/null
+++ b/checkpoint-98/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
diff --git a/checkpoint-98/trainer_state.json b/checkpoint-98/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c68a94122621cbfdc175825e3de764c888b9d078
--- /dev/null
+++ b/checkpoint-98/trainer_state.json
@@ -0,0 +1,771 @@
+{
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.9795918367346939,
+ "eval_steps": 13,
+ "global_step": 98,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.02040816326530612,
+ "grad_norm": 0.7881951332092285,
+ "learning_rate": 2e-05,
+ "loss": 2.7509,
+ "step": 1
+ },
+ {
+ "epoch": 0.02040816326530612,
+ "eval_loss": 2.6902382373809814,
+ "eval_runtime": 269.5606,
+ "eval_samples_per_second": 6.288,
+ "eval_steps_per_second": 3.146,
+ "step": 1
+ },
+ {
+ "epoch": 0.04081632653061224,
+ "grad_norm": 0.789082407951355,
+ "learning_rate": 4e-05,
+ "loss": 2.7449,
+ "step": 2
+ },
+ {
+ "epoch": 0.061224489795918366,
+ "grad_norm": 0.7354114055633545,
+ "learning_rate": 6e-05,
+ "loss": 2.7164,
+ "step": 3
+ },
+ {
+ "epoch": 0.08163265306122448,
+ "grad_norm": 0.7292255759239197,
+ "learning_rate": 8e-05,
+ "loss": 2.7174,
+ "step": 4
+ },
+ {
+ "epoch": 0.10204081632653061,
+ "grad_norm": 0.6898028254508972,
+ "learning_rate": 0.0001,
+ "loss": 2.6891,
+ "step": 5
+ },
+ {
+ "epoch": 0.12244897959183673,
+ "grad_norm": 0.6861400604248047,
+ "learning_rate": 0.00012,
+ "loss": 2.6545,
+ "step": 6
+ },
+ {
+ "epoch": 0.14285714285714285,
+ "grad_norm": 0.7510350346565247,
+ "learning_rate": 0.00014,
+ "loss": 2.5656,
+ "step": 7
+ },
+ {
+ "epoch": 0.16326530612244897,
+ "grad_norm": 0.8011165261268616,
+ "learning_rate": 0.00016,
+ "loss": 2.4519,
+ "step": 8
+ },
+ {
+ "epoch": 0.1836734693877551,
+ "grad_norm": 0.8624005317687988,
+ "learning_rate": 0.00018,
+ "loss": 2.3178,
+ "step": 9
+ },
+ {
+ "epoch": 0.20408163265306123,
+ "grad_norm": 0.8004987835884094,
+ "learning_rate": 0.0002,
+ "loss": 2.1783,
+ "step": 10
+ },
+ {
+ "epoch": 0.22448979591836735,
+ "grad_norm": 0.6362400054931641,
+ "learning_rate": 0.000199985736255971,
+ "loss": 2.0252,
+ "step": 11
+ },
+ {
+ "epoch": 0.24489795918367346,
+ "grad_norm": 0.7930936217308044,
+ "learning_rate": 0.0001999429490929718,
+ "loss": 1.8839,
+ "step": 12
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "grad_norm": 0.5149843096733093,
+ "learning_rate": 0.00019987165071710527,
+ "loss": 1.8064,
+ "step": 13
+ },
+ {
+ "epoch": 0.2653061224489796,
+ "eval_loss": 1.6734941005706787,
+ "eval_runtime": 271.2615,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 13
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 0.42121434211730957,
+ "learning_rate": 0.00019977186146800707,
+ "loss": 1.7922,
+ "step": 14
+ },
+ {
+ "epoch": 0.30612244897959184,
+ "grad_norm": 0.3523242771625519,
+ "learning_rate": 0.0001996436098130433,
+ "loss": 1.7711,
+ "step": 15
+ },
+ {
+ "epoch": 0.32653061224489793,
+ "grad_norm": 0.3384595215320587,
+ "learning_rate": 0.00019948693233918952,
+ "loss": 1.7152,
+ "step": 16
+ },
+ {
+ "epoch": 0.3469387755102041,
+ "grad_norm": 0.34942421317100525,
+ "learning_rate": 0.00019930187374259337,
+ "loss": 1.7112,
+ "step": 17
+ },
+ {
+ "epoch": 0.3673469387755102,
+ "grad_norm": 0.31712639331817627,
+ "learning_rate": 0.00019908848681582391,
+ "loss": 1.7059,
+ "step": 18
+ },
+ {
+ "epoch": 0.3877551020408163,
+ "grad_norm": 0.2875436842441559,
+ "learning_rate": 0.00019884683243281116,
+ "loss": 1.6468,
+ "step": 19
+ },
+ {
+ "epoch": 0.40816326530612246,
+ "grad_norm": 0.24433130025863647,
+ "learning_rate": 0.00019857697953148037,
+ "loss": 1.6408,
+ "step": 20
+ },
+ {
+ "epoch": 0.42857142857142855,
+ "grad_norm": 0.21414674818515778,
+ "learning_rate": 0.00019827900509408581,
+ "loss": 1.616,
+ "step": 21
+ },
+ {
+ "epoch": 0.4489795918367347,
+ "grad_norm": 0.21537622809410095,
+ "learning_rate": 0.00019795299412524945,
+ "loss": 1.609,
+ "step": 22
+ },
+ {
+ "epoch": 0.46938775510204084,
+ "grad_norm": 0.2432074397802353,
+ "learning_rate": 0.00019759903962771156,
+ "loss": 1.6066,
+ "step": 23
+ },
+ {
+ "epoch": 0.4897959183673469,
+ "grad_norm": 0.2359839379787445,
+ "learning_rate": 0.00019721724257579907,
+ "loss": 1.5851,
+ "step": 24
+ },
+ {
+ "epoch": 0.5102040816326531,
+ "grad_norm": 0.22065888345241547,
+ "learning_rate": 0.00019680771188662044,
+ "loss": 1.5739,
+ "step": 25
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "grad_norm": 0.20339132845401764,
+ "learning_rate": 0.0001963705643889941,
+ "loss": 1.5513,
+ "step": 26
+ },
+ {
+ "epoch": 0.5306122448979592,
+ "eval_loss": 1.4832030534744263,
+ "eval_runtime": 271.2449,
+ "eval_samples_per_second": 6.249,
+ "eval_steps_per_second": 3.126,
+ "step": 26
+ },
+ {
+ "epoch": 0.5510204081632653,
+ "grad_norm": 0.18875224888324738,
+ "learning_rate": 0.00019590592479012023,
+ "loss": 1.5378,
+ "step": 27
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 0.18564417958259583,
+ "learning_rate": 0.00019541392564000488,
+ "loss": 1.5212,
+ "step": 28
+ },
+ {
+ "epoch": 0.5918367346938775,
+ "grad_norm": 0.16226942837238312,
+ "learning_rate": 0.00019489470729364692,
+ "loss": 1.5391,
+ "step": 29
+ },
+ {
+ "epoch": 0.6122448979591837,
+ "grad_norm": 0.15650039911270142,
+ "learning_rate": 0.00019434841787099803,
+ "loss": 1.511,
+ "step": 30
+ },
+ {
+ "epoch": 0.6326530612244898,
+ "grad_norm": 0.15976540744304657,
+ "learning_rate": 0.00019377521321470805,
+ "loss": 1.5119,
+ "step": 31
+ },
+ {
+ "epoch": 0.6530612244897959,
+ "grad_norm": 0.16409288346767426,
+ "learning_rate": 0.00019317525684566685,
+ "loss": 1.4909,
+ "step": 32
+ },
+ {
+ "epoch": 0.673469387755102,
+ "grad_norm": 0.15468019247055054,
+ "learning_rate": 0.00019254871991635598,
+ "loss": 1.4951,
+ "step": 33
+ },
+ {
+ "epoch": 0.6938775510204082,
+ "grad_norm": 0.1462036371231079,
+ "learning_rate": 0.00019189578116202307,
+ "loss": 1.4643,
+ "step": 34
+ },
+ {
+ "epoch": 0.7142857142857143,
+ "grad_norm": 0.1541963368654251,
+ "learning_rate": 0.00019121662684969335,
+ "loss": 1.5159,
+ "step": 35
+ },
+ {
+ "epoch": 0.7346938775510204,
+ "grad_norm": 0.14798064529895782,
+ "learning_rate": 0.00019051145072503215,
+ "loss": 1.4741,
+ "step": 36
+ },
+ {
+ "epoch": 0.7551020408163265,
+ "grad_norm": 0.13914817571640015,
+ "learning_rate": 0.00018978045395707418,
+ "loss": 1.4788,
+ "step": 37
+ },
+ {
+ "epoch": 0.7755102040816326,
+ "grad_norm": 0.15608824789524078,
+ "learning_rate": 0.00018902384508083517,
+ "loss": 1.4687,
+ "step": 38
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "grad_norm": 0.14460116624832153,
+ "learning_rate": 0.00018824183993782192,
+ "loss": 1.482,
+ "step": 39
+ },
+ {
+ "epoch": 0.7959183673469388,
+ "eval_loss": 1.411073088645935,
+ "eval_runtime": 271.292,
+ "eval_samples_per_second": 6.248,
+ "eval_steps_per_second": 3.126,
+ "step": 39
+ },
+ {
+ "epoch": 0.8163265306122449,
+ "grad_norm": 0.15740551054477692,
+ "learning_rate": 0.00018743466161445823,
+ "loss": 1.4486,
+ "step": 40
+ },
+ {
+ "epoch": 0.8367346938775511,
+ "grad_norm": 0.14149661362171173,
+ "learning_rate": 0.00018660254037844388,
+ "loss": 1.4353,
+ "step": 41
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 0.14034292101860046,
+ "learning_rate": 0.0001857457136130651,
+ "loss": 1.4523,
+ "step": 42
+ },
+ {
+ "epoch": 0.8775510204081632,
+ "grad_norm": 0.1487722396850586,
+ "learning_rate": 0.00018486442574947511,
+ "loss": 1.4095,
+ "step": 43
+ },
+ {
+ "epoch": 0.8979591836734694,
+ "grad_norm": 0.17400234937667847,
+ "learning_rate": 0.00018395892819696389,
+ "loss": 1.4414,
+ "step": 44
+ },
+ {
+ "epoch": 0.9183673469387755,
+ "grad_norm": 0.1741325408220291,
+ "learning_rate": 0.00018302947927123766,
+ "loss": 1.4379,
+ "step": 45
+ },
+ {
+ "epoch": 0.9387755102040817,
+ "grad_norm": 0.15319454669952393,
+ "learning_rate": 0.00018207634412072764,
+ "loss": 1.405,
+ "step": 46
+ },
+ {
+ "epoch": 0.9591836734693877,
+ "grad_norm": 0.15876264870166779,
+ "learning_rate": 0.00018109979465095013,
+ "loss": 1.4122,
+ "step": 47
+ },
+ {
+ "epoch": 0.9795918367346939,
+ "grad_norm": 0.17120805382728577,
+ "learning_rate": 0.00018010010944693848,
+ "loss": 1.4132,
+ "step": 48
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 0.1436116099357605,
+ "learning_rate": 0.00017907757369376985,
+ "loss": 1.416,
+ "step": 49
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.1707429438829422,
+ "learning_rate": 0.0001780324790952092,
+ "loss": 1.3913,
+ "step": 50
+ },
+ {
+ "epoch": 1.0204081632653061,
+ "grad_norm": 0.17117524147033691,
+ "learning_rate": 0.00017696512379049325,
+ "loss": 1.3963,
+ "step": 51
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "grad_norm": 0.13410089910030365,
+ "learning_rate": 0.0001758758122692791,
+ "loss": 1.392,
+ "step": 52
+ },
+ {
+ "epoch": 1.0408163265306123,
+ "eval_loss": 1.3676769733428955,
+ "eval_runtime": 270.8566,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 52
+ },
+ {
+ "epoch": 1.0612244897959184,
+ "grad_norm": 0.18877607583999634,
+ "learning_rate": 0.00017476485528478093,
+ "loss": 1.3854,
+ "step": 53
+ },
+ {
+ "epoch": 1.0816326530612246,
+ "grad_norm": 0.1752927452325821,
+ "learning_rate": 0.00017363256976511972,
+ "loss": 1.3759,
+ "step": 54
+ },
+ {
+ "epoch": 1.1020408163265305,
+ "grad_norm": 0.17180170118808746,
+ "learning_rate": 0.000172479278722912,
+ "loss": 1.3614,
+ "step": 55
+ },
+ {
+ "epoch": 1.1224489795918366,
+ "grad_norm": 0.1640290915966034,
+ "learning_rate": 0.00017130531116312203,
+ "loss": 1.3853,
+ "step": 56
+ },
+ {
+ "epoch": 1.1428571428571428,
+ "grad_norm": 0.2047068476676941,
+ "learning_rate": 0.0001701110019892053,
+ "loss": 1.3699,
+ "step": 57
+ },
+ {
+ "epoch": 1.163265306122449,
+ "grad_norm": 0.1835869997739792,
+ "learning_rate": 0.00016889669190756868,
+ "loss": 1.3403,
+ "step": 58
+ },
+ {
+ "epoch": 1.183673469387755,
+ "grad_norm": 0.16733241081237793,
+ "learning_rate": 0.00016766272733037576,
+ "loss": 1.3609,
+ "step": 59
+ },
+ {
+ "epoch": 1.2040816326530612,
+ "grad_norm": 0.178726926445961,
+ "learning_rate": 0.00016640946027672392,
+ "loss": 1.3651,
+ "step": 60
+ },
+ {
+ "epoch": 1.2244897959183674,
+ "grad_norm": 0.16719630360603333,
+ "learning_rate": 0.00016513724827222227,
+ "loss": 1.3676,
+ "step": 61
+ },
+ {
+ "epoch": 1.2448979591836735,
+ "grad_norm": 0.15999363362789154,
+ "learning_rate": 0.00016384645424699835,
+ "loss": 1.3651,
+ "step": 62
+ },
+ {
+ "epoch": 1.2653061224489797,
+ "grad_norm": 0.1705988198518753,
+ "learning_rate": 0.00016253744643216368,
+ "loss": 1.3757,
+ "step": 63
+ },
+ {
+ "epoch": 1.2857142857142856,
+ "grad_norm": 0.14996370673179626,
+ "learning_rate": 0.0001612105982547663,
+ "loss": 1.3474,
+ "step": 64
+ },
+ {
+ "epoch": 1.306122448979592,
+ "grad_norm": 0.19127260148525238,
+ "learning_rate": 0.0001598662882312615,
+ "loss": 1.3414,
+ "step": 65
+ },
+ {
+ "epoch": 1.306122448979592,
+ "eval_loss": 1.331880807876587,
+ "eval_runtime": 270.8424,
+ "eval_samples_per_second": 6.258,
+ "eval_steps_per_second": 3.131,
+ "step": 65
+ },
+ {
+ "epoch": 1.3265306122448979,
+ "grad_norm": 0.16125527024269104,
+ "learning_rate": 0.00015850489985953076,
+ "loss": 1.3509,
+ "step": 66
+ },
+ {
+ "epoch": 1.346938775510204,
+ "grad_norm": 0.1979473978281021,
+ "learning_rate": 0.00015712682150947923,
+ "loss": 1.3579,
+ "step": 67
+ },
+ {
+ "epoch": 1.3673469387755102,
+ "grad_norm": 0.18317992985248566,
+ "learning_rate": 0.00015573244631224365,
+ "loss": 1.3341,
+ "step": 68
+ },
+ {
+ "epoch": 1.3877551020408163,
+ "grad_norm": 0.1646898239850998,
+ "learning_rate": 0.0001543221720480419,
+ "loss": 1.3361,
+ "step": 69
+ },
+ {
+ "epoch": 1.4081632653061225,
+ "grad_norm": 0.1760271042585373,
+ "learning_rate": 0.00015289640103269625,
+ "loss": 1.358,
+ "step": 70
+ },
+ {
+ "epoch": 1.4285714285714286,
+ "grad_norm": 0.165283203125,
+ "learning_rate": 0.0001514555400028629,
+ "loss": 1.3072,
+ "step": 71
+ },
+ {
+ "epoch": 1.4489795918367347,
+ "grad_norm": 0.1507076472043991,
+ "learning_rate": 0.00015000000000000001,
+ "loss": 1.3133,
+ "step": 72
+ },
+ {
+ "epoch": 1.469387755102041,
+ "grad_norm": 0.16913647949695587,
+ "learning_rate": 0.00014853019625310813,
+ "loss": 1.3232,
+ "step": 73
+ },
+ {
+ "epoch": 1.489795918367347,
+ "grad_norm": 0.18266479671001434,
+ "learning_rate": 0.0001470465480602756,
+ "loss": 1.3512,
+ "step": 74
+ },
+ {
+ "epoch": 1.510204081632653,
+ "grad_norm": 0.19301828742027283,
+ "learning_rate": 0.0001455494786690634,
+ "loss": 1.3241,
+ "step": 75
+ },
+ {
+ "epoch": 1.5306122448979593,
+ "grad_norm": 0.16109652817249298,
+ "learning_rate": 0.00014403941515576344,
+ "loss": 1.3256,
+ "step": 76
+ },
+ {
+ "epoch": 1.5510204081632653,
+ "grad_norm": 0.17053867876529694,
+ "learning_rate": 0.00014251678830356408,
+ "loss": 1.3162,
+ "step": 77
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "grad_norm": 0.17348544299602509,
+ "learning_rate": 0.00014098203247965875,
+ "loss": 1.3213,
+ "step": 78
+ },
+ {
+ "epoch": 1.5714285714285714,
+ "eval_loss": 1.3028697967529297,
+ "eval_runtime": 270.8095,
+ "eval_samples_per_second": 6.259,
+ "eval_steps_per_second": 3.131,
+ "step": 78
+ },
+ {
+ "epoch": 1.5918367346938775,
+ "grad_norm": 0.1703907549381256,
+ "learning_rate": 0.00013943558551133186,
+ "loss": 1.3073,
+ "step": 79
+ },
+ {
+ "epoch": 1.6122448979591837,
+ "grad_norm": 0.17313100397586823,
+ "learning_rate": 0.0001378778885610576,
+ "loss": 1.3232,
+ "step": 80
+ },
+ {
+ "epoch": 1.6326530612244898,
+ "grad_norm": 0.17237025499343872,
+ "learning_rate": 0.00013630938600064747,
+ "loss": 1.3406,
+ "step": 81
+ },
+ {
+ "epoch": 1.6530612244897958,
+ "grad_norm": 0.19658459722995758,
+ "learning_rate": 0.00013473052528448201,
+ "loss": 1.3114,
+ "step": 82
+ },
+ {
+ "epoch": 1.6734693877551021,
+ "grad_norm": 0.20599938929080963,
+ "learning_rate": 0.0001331417568218636,
+ "loss": 1.3288,
+ "step": 83
+ },
+ {
+ "epoch": 1.693877551020408,
+ "grad_norm": 0.17759399116039276,
+ "learning_rate": 0.00013154353384852558,
+ "loss": 1.2995,
+ "step": 84
+ },
+ {
+ "epoch": 1.7142857142857144,
+ "grad_norm": 0.18712250888347626,
+ "learning_rate": 0.00012993631229733582,
+ "loss": 1.2895,
+ "step": 85
+ },
+ {
+ "epoch": 1.7346938775510203,
+ "grad_norm": 0.1991330236196518,
+ "learning_rate": 0.00012832055066823038,
+ "loss": 1.2886,
+ "step": 86
+ },
+ {
+ "epoch": 1.7551020408163265,
+ "grad_norm": 0.22125203907489777,
+ "learning_rate": 0.00012669670989741517,
+ "loss": 1.3233,
+ "step": 87
+ },
+ {
+ "epoch": 1.7755102040816326,
+ "grad_norm": 0.2052813619375229,
+ "learning_rate": 0.00012506525322587207,
+ "loss": 1.3079,
+ "step": 88
+ },
+ {
+ "epoch": 1.7959183673469388,
+ "grad_norm": 0.19290736317634583,
+ "learning_rate": 0.00012342664606720822,
+ "loss": 1.3174,
+ "step": 89
+ },
+ {
+ "epoch": 1.816326530612245,
+ "grad_norm": 0.20912542939186096,
+ "learning_rate": 0.00012178135587488515,
+ "loss": 1.2915,
+ "step": 90
+ },
+ {
+ "epoch": 1.836734693877551,
+ "grad_norm": 0.20760588347911835,
+ "learning_rate": 0.00012012985200886602,
+ "loss": 1.3028,
+ "step": 91
+ },
+ {
+ "epoch": 1.836734693877551,
+ "eval_loss": 1.2795333862304688,
+ "eval_runtime": 270.6525,
+ "eval_samples_per_second": 6.263,
+ "eval_steps_per_second": 3.133,
+ "step": 91
+ },
+ {
+ "epoch": 1.8571428571428572,
+ "grad_norm": 0.1996900886297226,
+ "learning_rate": 0.00011847260560171896,
+ "loss": 1.3119,
+ "step": 92
+ },
+ {
+ "epoch": 1.8775510204081631,
+ "grad_norm": 0.23766876757144928,
+ "learning_rate": 0.00011681008942421483,
+ "loss": 1.2978,
+ "step": 93
+ },
+ {
+ "epoch": 1.8979591836734695,
+ "grad_norm": 0.19782397150993347,
+ "learning_rate": 0.00011514277775045768,
+ "loss": 1.2955,
+ "step": 94
+ },
+ {
+ "epoch": 1.9183673469387754,
+ "grad_norm": 0.22519494593143463,
+ "learning_rate": 0.00011347114622258612,
+ "loss": 1.2957,
+ "step": 95
+ },
+ {
+ "epoch": 1.9387755102040818,
+ "grad_norm": 0.2590245306491852,
+ "learning_rate": 0.00011179567171508463,
+ "loss": 1.2809,
+ "step": 96
+ },
+ {
+ "epoch": 1.9591836734693877,
+ "grad_norm": 0.2235420197248459,
+ "learning_rate": 0.00011011683219874323,
+ "loss": 1.2784,
+ "step": 97
+ },
+ {
+ "epoch": 1.9795918367346939,
+ "grad_norm": 0.285740464925766,
+ "learning_rate": 0.00010843510660430447,
+ "loss": 1.309,
+ "step": 98
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 196,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 4,
+ "save_steps": 49,
+ "total_flos": 2.0418701605994496e+16,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+}
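
The `log_history` entries above record one training-loss point per step (`logging_steps: 1`) and one eval-loss point roughly every 13 steps. A minimal sketch for pulling those curves back out of `checkpoint-98/trainer_state.json`, assuming the standard Hugging Face `Trainer` state layout (the `log_history` key is not visible in this excerpt but is the usual container for these records):

```python
import json

# Read the trainer state saved alongside the checkpoint.
with open("checkpoint-98/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_points)} train points, {len(eval_points)} eval points")
if eval_points:
    print("latest eval loss:", eval_points[-1][1])
```
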
diff --git a/checkpoint-98/training_args.bin b/checkpoint-98/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..be517173d48b34e38c41c52f74f7e02495562a61
--- /dev/null
+++ b/checkpoint-98/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..63c865633310e5244de44820c844cfc9e3b45dbd
--- /dev/null
+++ b/config.json
@@ -0,0 +1,43 @@
+{
+ "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+ "architectures": [
+ "LlamaForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5632,
+ "max_position_embeddings": 4096,
+ "model_type": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 22,
+ "num_key_value_heads": 4,
+ "pretraining_tp": 1,
+ "quantization_config": {
+ "_load_in_4bit": false,
+ "_load_in_8bit": true,
+ "bnb_4bit_compute_dtype": "float32",
+ "bnb_4bit_quant_storage": "uint8",
+ "bnb_4bit_quant_type": "fp4",
+ "bnb_4bit_use_double_quant": false,
+ "llm_int8_enable_fp32_cpu_offload": false,
+ "llm_int8_has_fp16_weight": false,
+ "llm_int8_skip_modules": null,
+ "llm_int8_threshold": 6.0,
+ "load_in_4bit": false,
+ "load_in_8bit": true,
+ "quant_method": "bitsandbytes"
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.40.2",
+ "use_cache": false,
+ "vocab_size": 32000
+}
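
The `quantization_config` block above records that the base weights were loaded through bitsandbytes in 8-bit during training (`load_in_8bit: true`, no 4-bit). A minimal sketch of reproducing that setup at inference time and attaching the LoRA adapter with PEFT; the adapter path `./outputs/lora-out` is taken from the axolotl config, and the rest follows the values in this `config.json`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# Mirror the recorded quantization settings: 8-bit load via bitsandbytes.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Attach the LoRA adapter produced by this run (output_dir per the axolotl config).
model = PeftModel.from_pretrained(model, "./outputs/lora-out")
```
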
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,24 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "",
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.model b/tokenizer.model
new file mode 100644
index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899
--- /dev/null
+++ b/tokenizer.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0da2d3613b2ca0d42b48dea2162c06006f52b837
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,45 @@
+{
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": true,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "",
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "",
+ "legacy": false,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "",
+ "use_default_system_prompt": false,
+ "use_fast": true
+}
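
The `chat_template` stored here is the standard ChatML template, matching `chat_template: chatml` in the axolotl config. A minimal sketch of formatting a prompt with it; the local path and the example color are placeholders, not values from this repo:

```python
from transformers import AutoTokenizer

# Placeholder path: point this at the directory containing tokenizer_config.json.
tokenizer = AutoTokenizer.from_pretrained("./outputs/lora-out")

# The training data maps a color name to a description, so the user turn is a color.
messages = [{"role": "user", "content": "deep teal"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# deep teal<|im_end|>
# <|im_start|>assistant
```
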