nassersala committed on May 27, 2024

Commit

9e1f213

verified ·

1 Parent(s): e60ee44

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +156 -3
adapter_config.json +34 -0
adapter_model.bin +3 -0
checkpoint-147/README.md +202 -0
checkpoint-147/adapter_config.json +34 -0
checkpoint-147/adapter_model.safetensors +3 -0
checkpoint-147/optimizer.pt +3 -0
checkpoint-147/rng_state.pth +3 -0
checkpoint-147/scheduler.pt +3 -0
checkpoint-147/special_tokens_map.json +24 -0
checkpoint-147/tokenizer.model +3 -0
checkpoint-147/tokenizer_config.json +45 -0
checkpoint-147/trainer_state.json +1146 -0
checkpoint-147/training_args.bin +3 -0
checkpoint-196/README.md +202 -0
checkpoint-196/adapter_config.json +34 -0
checkpoint-196/adapter_model.safetensors +3 -0
checkpoint-196/optimizer.pt +3 -0
checkpoint-196/rng_state.pth +3 -0
checkpoint-196/scheduler.pt +3 -0
checkpoint-196/special_tokens_map.json +24 -0
checkpoint-196/tokenizer.model +3 -0
checkpoint-196/tokenizer_config.json +45 -0
checkpoint-196/trainer_state.json +1521 -0
checkpoint-196/training_args.bin +3 -0
checkpoint-49/README.md +202 -0
checkpoint-49/adapter_config.json +34 -0
checkpoint-49/adapter_model.safetensors +3 -0
checkpoint-49/optimizer.pt +3 -0
checkpoint-49/rng_state.pth +3 -0
checkpoint-49/scheduler.pt +3 -0
checkpoint-49/special_tokens_map.json +24 -0
checkpoint-49/tokenizer.model +3 -0
checkpoint-49/tokenizer_config.json +45 -0
checkpoint-49/trainer_state.json +396 -0
checkpoint-49/training_args.bin +3 -0
checkpoint-98/README.md +202 -0
checkpoint-98/adapter_config.json +34 -0
checkpoint-98/adapter_model.safetensors +3 -0
checkpoint-98/optimizer.pt +3 -0
checkpoint-98/rng_state.pth +3 -0
checkpoint-98/scheduler.pt +3 -0
checkpoint-98/special_tokens_map.json +24 -0
checkpoint-98/tokenizer.model +3 -0
checkpoint-98/tokenizer_config.json +45 -0
checkpoint-98/trainer_state.json +771 -0
checkpoint-98/training_args.bin +3 -0
config.json +43 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,156 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model-index:
+- name: outputs/lora-out
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+datasets:
+  - path: burkelibbey/colors
+    type:
+      field_instruction: color
+      field_output: description
+    conversation: chatml
+chat_template: chatml
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/lora-out
+sequence_len: 4096
+sample_packing: true
+eval_sample_packing: false
+pad_to_sequence_len: true
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+warmup_steps: 10
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+```
+</details><br>
+# outputs/lora-out
+This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T) on the [burkelibbey/colors](https://huggingface.co/datasets/burkelibbey/colors) dataset.
+It achieves the following results on the evaluation set:
+- Loss: 1.2375
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
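+Per the axolotl config above, the model was trained on the `burkelibbey/colors` dataset, using the `color` field as the instruction and the `description` field as the output, formatted with the ChatML template. A fraction of 0.05 of the data (`val_set_size: 0.05`) was held out as the evaluation set.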
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: adamw_bnb_8bit (as set in the axolotl config) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 10
+- num_epochs: 4
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 2.7509        | 0.0204 | 1    | 2.6902          |
+| 1.8064        | 0.2653 | 13   | 1.6735          |
+| 1.5513        | 0.5306 | 26   | 1.4832          |
+| 1.482         | 0.7959 | 39   | 1.4111          |
+| 1.392         | 1.0408 | 52   | 1.3677          |
+| 1.3414        | 1.3061 | 65   | 1.3319          |
+| 1.3213        | 1.5714 | 78   | 1.3029          |
+| 1.3028        | 1.8367 | 91   | 1.2795          |
+| 1.2761        | 2.0816 | 104  | 1.2697          |
+| 1.2509        | 2.3469 | 117  | 1.2587          |
+| 1.2884        | 2.6122 | 130  | 1.2472          |
+| 1.254         | 2.8776 | 143  | 1.2410          |
+| 1.2523        | 3.1224 | 156  | 1.2403          |
+| 1.2468        | 3.3878 | 169  | 1.2385          |
+| 1.2476        | 3.6531 | 182  | 1.2370          |
+| 1.2366        | 3.9184 | 195  | 1.2375          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.2
+- Pytorch 2.1.2+cu118
+- Datasets 2.19.1
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f20d120db2b5b6953281cb7fa6e550c36182e6da8f44b598738a5995d5be6f
+size 101036698

checkpoint-147/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-147/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-147/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb2ebdaf4b36ef443d056e4e52b5f0bf8223038232557b97bb7ce888df4d3c48
+size 100966336

checkpoint-147/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88cf550811bb96f9852bdb7a8952d49f6f0bf413e95b0759a8db28fcab406988
+size 50916644

checkpoint-147/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4e8611d6bcf761201e741bdb2188a6ac976702d2e3f1a3ecc21fff90ea8a001
+size 14244

checkpoint-147/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83c1e2e1bea1da15cd4a47196fc191277510622d916f0b4b5e8c95f3258d5825
+size 1064

checkpoint-147/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-147/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-147/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-147/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1146 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9591836734693877,
+  "eval_steps": 13,
+  "global_step": 147,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02040816326530612,
+      "grad_norm": 0.7881951332092285,
+      "learning_rate": 2e-05,
+      "loss": 2.7509,
+      "step": 1
+    },
+    {
+      "epoch": 0.02040816326530612,
+      "eval_loss": 2.6902382373809814,
+      "eval_runtime": 269.5606,
+      "eval_samples_per_second": 6.288,
+      "eval_steps_per_second": 3.146,
+      "step": 1
+    },
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 0.789082407951355,
+      "learning_rate": 4e-05,
+      "loss": 2.7449,
+      "step": 2
+    },
+    {
+      "epoch": 0.061224489795918366,
+      "grad_norm": 0.7354114055633545,
+      "learning_rate": 6e-05,
+      "loss": 2.7164,
+      "step": 3
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 0.7292255759239197,
+      "learning_rate": 8e-05,
+      "loss": 2.7174,
+      "step": 4
+    },
+    {
+      "epoch": 0.10204081632653061,
+      "grad_norm": 0.6898028254508972,
+      "learning_rate": 0.0001,
+      "loss": 2.6891,
+      "step": 5
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 0.6861400604248047,
+      "learning_rate": 0.00012,
+      "loss": 2.6545,
+      "step": 6
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.7510350346565247,
+      "learning_rate": 0.00014,
+      "loss": 2.5656,
+      "step": 7
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.8011165261268616,
+      "learning_rate": 0.00016,
+      "loss": 2.4519,
+      "step": 8
+    },
+    {
+      "epoch": 0.1836734693877551,
+      "grad_norm": 0.8624005317687988,
+      "learning_rate": 0.00018,
+      "loss": 2.3178,
+      "step": 9
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 0.8004987835884094,
+      "learning_rate": 0.0002,
+      "loss": 2.1783,
+      "step": 10
+    },
+    {
+      "epoch": 0.22448979591836735,
+      "grad_norm": 0.6362400054931641,
+      "learning_rate": 0.000199985736255971,
+      "loss": 2.0252,
+      "step": 11
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 0.7930936217308044,
+      "learning_rate": 0.0001999429490929718,
+      "loss": 1.8839,
+      "step": 12
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "grad_norm": 0.5149843096733093,
+      "learning_rate": 0.00019987165071710527,
+      "loss": 1.8064,
+      "step": 13
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "eval_loss": 1.6734941005706787,
+      "eval_runtime": 271.2615,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 13
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.42121434211730957,
+      "learning_rate": 0.00019977186146800707,
+      "loss": 1.7922,
+      "step": 14
+    },
+    {
+      "epoch": 0.30612244897959184,
+      "grad_norm": 0.3523242771625519,
+      "learning_rate": 0.0001996436098130433,
+      "loss": 1.7711,
+      "step": 15
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.3384595215320587,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 1.7152,
+      "step": 16
+    },
+    {
+      "epoch": 0.3469387755102041,
+      "grad_norm": 0.34942421317100525,
+      "learning_rate": 0.00019930187374259337,
+      "loss": 1.7112,
+      "step": 17
+    },
+    {
+      "epoch": 0.3673469387755102,
+      "grad_norm": 0.31712639331817627,
+      "learning_rate": 0.00019908848681582391,
+      "loss": 1.7059,
+      "step": 18
+    },
+    {
+      "epoch": 0.3877551020408163,
+      "grad_norm": 0.2875436842441559,
+      "learning_rate": 0.00019884683243281116,
+      "loss": 1.6468,
+      "step": 19
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.24433130025863647,
+      "learning_rate": 0.00019857697953148037,
+      "loss": 1.6408,
+      "step": 20
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.21414674818515778,
+      "learning_rate": 0.00019827900509408581,
+      "loss": 1.616,
+      "step": 21
+    },
+    {
+      "epoch": 0.4489795918367347,
+      "grad_norm": 0.21537622809410095,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 1.609,
+      "step": 22
+    },
+    {
+      "epoch": 0.46938775510204084,
+      "grad_norm": 0.2432074397802353,
+      "learning_rate": 0.00019759903962771156,
+      "loss": 1.6066,
+      "step": 23
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 0.2359839379787445,
+      "learning_rate": 0.00019721724257579907,
+      "loss": 1.5851,
+      "step": 24
+    },
+    {
+      "epoch": 0.5102040816326531,
+      "grad_norm": 0.22065888345241547,
+      "learning_rate": 0.00019680771188662044,
+      "loss": 1.5739,
+      "step": 25
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "grad_norm": 0.20339132845401764,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 1.5513,
+      "step": 26
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "eval_loss": 1.4832030534744263,
+      "eval_runtime": 271.2449,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 26
+    },
+    {
+      "epoch": 0.5510204081632653,
+      "grad_norm": 0.18875224888324738,
+      "learning_rate": 0.00019590592479012023,
+      "loss": 1.5378,
+      "step": 27
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.18564417958259583,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 1.5212,
+      "step": 28
+    },
+    {
+      "epoch": 0.5918367346938775,
+      "grad_norm": 0.16226942837238312,
+      "learning_rate": 0.00019489470729364692,
+      "loss": 1.5391,
+      "step": 29
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 0.15650039911270142,
+      "learning_rate": 0.00019434841787099803,
+      "loss": 1.511,
+      "step": 30
+    },
+    {
+      "epoch": 0.6326530612244898,
+      "grad_norm": 0.15976540744304657,
+      "learning_rate": 0.00019377521321470805,
+      "loss": 1.5119,
+      "step": 31
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.16409288346767426,
+      "learning_rate": 0.00019317525684566685,
+      "loss": 1.4909,
+      "step": 32
+    },
+    {
+      "epoch": 0.673469387755102,
+      "grad_norm": 0.15468019247055054,
+      "learning_rate": 0.00019254871991635598,
+      "loss": 1.4951,
+      "step": 33
+    },
+    {
+      "epoch": 0.6938775510204082,
+      "grad_norm": 0.1462036371231079,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 1.4643,
+      "step": 34
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.1541963368654251,
+      "learning_rate": 0.00019121662684969335,
+      "loss": 1.5159,
+      "step": 35
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 0.14798064529895782,
+      "learning_rate": 0.00019051145072503215,
+      "loss": 1.4741,
+      "step": 36
+    },
+    {
+      "epoch": 0.7551020408163265,
+      "grad_norm": 0.13914817571640015,
+      "learning_rate": 0.00018978045395707418,
+      "loss": 1.4788,
+      "step": 37
+    },
+    {
+      "epoch": 0.7755102040816326,
+      "grad_norm": 0.15608824789524078,
+      "learning_rate": 0.00018902384508083517,
+      "loss": 1.4687,
+      "step": 38
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "grad_norm": 0.14460116624832153,
+      "learning_rate": 0.00018824183993782192,
+      "loss": 1.482,
+      "step": 39
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "eval_loss": 1.411073088645935,
+      "eval_runtime": 271.292,
+      "eval_samples_per_second": 6.248,
+      "eval_steps_per_second": 3.126,
+      "step": 39
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.15740551054477692,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 1.4486,
+      "step": 40
+    },
+    {
+      "epoch": 0.8367346938775511,
+      "grad_norm": 0.14149661362171173,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.4353,
+      "step": 41
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.14034292101860046,
+      "learning_rate": 0.0001857457136130651,
+      "loss": 1.4523,
+      "step": 42
+    },
+    {
+      "epoch": 0.8775510204081632,
+      "grad_norm": 0.1487722396850586,
+      "learning_rate": 0.00018486442574947511,
+      "loss": 1.4095,
+      "step": 43
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 0.17400234937667847,
+      "learning_rate": 0.00018395892819696389,
+      "loss": 1.4414,
+      "step": 44
+    },
+    {
+      "epoch": 0.9183673469387755,
+      "grad_norm": 0.1741325408220291,
+      "learning_rate": 0.00018302947927123766,
+      "loss": 1.4379,
+      "step": 45
+    },
+    {
+      "epoch": 0.9387755102040817,
+      "grad_norm": 0.15319454669952393,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 1.405,
+      "step": 46
+    },
+    {
+      "epoch": 0.9591836734693877,
+      "grad_norm": 0.15876264870166779,
+      "learning_rate": 0.00018109979465095013,
+      "loss": 1.4122,
+      "step": 47
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.17120805382728577,
+      "learning_rate": 0.00018010010944693848,
+      "loss": 1.4132,
+      "step": 48
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1436116099357605,
+      "learning_rate": 0.00017907757369376985,
+      "loss": 1.416,
+      "step": 49
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.1707429438829422,
+      "learning_rate": 0.0001780324790952092,
+      "loss": 1.3913,
+      "step": 50
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.17117524147033691,
+      "learning_rate": 0.00017696512379049325,
+      "loss": 1.3963,
+      "step": 51
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "grad_norm": 0.13410089910030365,
+      "learning_rate": 0.0001758758122692791,
+      "loss": 1.392,
+      "step": 52
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "eval_loss": 1.3676769733428955,
+      "eval_runtime": 270.8566,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 52
+    },
+    {
+      "epoch": 1.0612244897959184,
+      "grad_norm": 0.18877607583999634,
+      "learning_rate": 0.00017476485528478093,
+      "loss": 1.3854,
+      "step": 53
+    },
+    {
+      "epoch": 1.0816326530612246,
+      "grad_norm": 0.1752927452325821,
+      "learning_rate": 0.00017363256976511972,
+      "loss": 1.3759,
+      "step": 54
+    },
+    {
+      "epoch": 1.1020408163265305,
+      "grad_norm": 0.17180170118808746,
+      "learning_rate": 0.000172479278722912,
+      "loss": 1.3614,
+      "step": 55
+    },
+    {
+      "epoch": 1.1224489795918366,
+      "grad_norm": 0.1640290915966034,
+      "learning_rate": 0.00017130531116312203,
+      "loss": 1.3853,
+      "step": 56
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.2047068476676941,
+      "learning_rate": 0.0001701110019892053,
+      "loss": 1.3699,
+      "step": 57
+    },
+    {
+      "epoch": 1.163265306122449,
+      "grad_norm": 0.1835869997739792,
+      "learning_rate": 0.00016889669190756868,
+      "loss": 1.3403,
+      "step": 58
+    },
+    {
+      "epoch": 1.183673469387755,
+      "grad_norm": 0.16733241081237793,
+      "learning_rate": 0.00016766272733037576,
+      "loss": 1.3609,
+      "step": 59
+    },
+    {
+      "epoch": 1.2040816326530612,
+      "grad_norm": 0.178726926445961,
+      "learning_rate": 0.00016640946027672392,
+      "loss": 1.3651,
+      "step": 60
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 0.16719630360603333,
+      "learning_rate": 0.00016513724827222227,
+      "loss": 1.3676,
+      "step": 61
+    },
+    {
+      "epoch": 1.2448979591836735,
+      "grad_norm": 0.15999363362789154,
+      "learning_rate": 0.00016384645424699835,
+      "loss": 1.3651,
+      "step": 62
+    },
+    {
+      "epoch": 1.2653061224489797,
+      "grad_norm": 0.1705988198518753,
+      "learning_rate": 0.00016253744643216368,
+      "loss": 1.3757,
+      "step": 63
+    },
+    {
+      "epoch": 1.2857142857142856,
+      "grad_norm": 0.14996370673179626,
+      "learning_rate": 0.0001612105982547663,
+      "loss": 1.3474,
+      "step": 64
+    },
+    {
+      "epoch": 1.306122448979592,
+      "grad_norm": 0.19127260148525238,
+      "learning_rate": 0.0001598662882312615,
+      "loss": 1.3414,
+      "step": 65
+    },
+    {
+      "epoch": 1.306122448979592,
+      "eval_loss": 1.331880807876587,
+      "eval_runtime": 270.8424,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 65
+    },
+    {
+      "epoch": 1.3265306122448979,
+      "grad_norm": 0.16125527024269104,
+      "learning_rate": 0.00015850489985953076,
+      "loss": 1.3509,
+      "step": 66
+    },
+    {
+      "epoch": 1.346938775510204,
+      "grad_norm": 0.1979473978281021,
+      "learning_rate": 0.00015712682150947923,
+      "loss": 1.3579,
+      "step": 67
+    },
+    {
+      "epoch": 1.3673469387755102,
+      "grad_norm": 0.18317992985248566,
+      "learning_rate": 0.00015573244631224365,
+      "loss": 1.3341,
+      "step": 68
+    },
+    {
+      "epoch": 1.3877551020408163,
+      "grad_norm": 0.1646898239850998,
+      "learning_rate": 0.0001543221720480419,
+      "loss": 1.3361,
+      "step": 69
+    },
+    {
+      "epoch": 1.4081632653061225,
+      "grad_norm": 0.1760271042585373,
+      "learning_rate": 0.00015289640103269625,
+      "loss": 1.358,
+      "step": 70
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.165283203125,
+      "learning_rate": 0.0001514555400028629,
+      "loss": 1.3072,
+      "step": 71
+    },
+    {
+      "epoch": 1.4489795918367347,
+      "grad_norm": 0.1507076472043991,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3133,
+      "step": 72
+    },
+    {
+      "epoch": 1.469387755102041,
+      "grad_norm": 0.16913647949695587,
+      "learning_rate": 0.00014853019625310813,
+      "loss": 1.3232,
+      "step": 73
+    },
+    {
+      "epoch": 1.489795918367347,
+      "grad_norm": 0.18266479671001434,
+      "learning_rate": 0.0001470465480602756,
+      "loss": 1.3512,
+      "step": 74
+    },
+    {
+      "epoch": 1.510204081632653,
+      "grad_norm": 0.19301828742027283,
+      "learning_rate": 0.0001455494786690634,
+      "loss": 1.3241,
+      "step": 75
+    },
+    {
+      "epoch": 1.5306122448979593,
+      "grad_norm": 0.16109652817249298,
+      "learning_rate": 0.00014403941515576344,
+      "loss": 1.3256,
+      "step": 76
+    },
+    {
+      "epoch": 1.5510204081632653,
+      "grad_norm": 0.17053867876529694,
+      "learning_rate": 0.00014251678830356408,
+      "loss": 1.3162,
+      "step": 77
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "grad_norm": 0.17348544299602509,
+      "learning_rate": 0.00014098203247965875,
+      "loss": 1.3213,
+      "step": 78
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "eval_loss": 1.3028697967529297,
+      "eval_runtime": 270.8095,
+      "eval_samples_per_second": 6.259,
+      "eval_steps_per_second": 3.131,
+      "step": 78
+    },
+    {
+      "epoch": 1.5918367346938775,
+      "grad_norm": 0.1703907549381256,
+      "learning_rate": 0.00013943558551133186,
+      "loss": 1.3073,
+      "step": 79
+    },
+    {
+      "epoch": 1.6122448979591837,
+      "grad_norm": 0.17313100397586823,
+      "learning_rate": 0.0001378778885610576,
+      "loss": 1.3232,
+      "step": 80
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.17237025499343872,
+      "learning_rate": 0.00013630938600064747,
+      "loss": 1.3406,
+      "step": 81
+    },
+    {
+      "epoch": 1.6530612244897958,
+      "grad_norm": 0.19658459722995758,
+      "learning_rate": 0.00013473052528448201,
+      "loss": 1.3114,
+      "step": 82
+    },
+    {
+      "epoch": 1.6734693877551021,
+      "grad_norm": 0.20599938929080963,
+      "learning_rate": 0.0001331417568218636,
+      "loss": 1.3288,
+      "step": 83
+    },
+    {
+      "epoch": 1.693877551020408,
+      "grad_norm": 0.17759399116039276,
+      "learning_rate": 0.00013154353384852558,
+      "loss": 1.2995,
+      "step": 84
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.18712250888347626,
+      "learning_rate": 0.00012993631229733582,
+      "loss": 1.2895,
+      "step": 85
+    },
+    {
+      "epoch": 1.7346938775510203,
+      "grad_norm": 0.1991330236196518,
+      "learning_rate": 0.00012832055066823038,
+      "loss": 1.2886,
+      "step": 86
+    },
+    {
+      "epoch": 1.7551020408163265,
+      "grad_norm": 0.22125203907489777,
+      "learning_rate": 0.00012669670989741517,
+      "loss": 1.3233,
+      "step": 87
+    },
+    {
+      "epoch": 1.7755102040816326,
+      "grad_norm": 0.2052813619375229,
+      "learning_rate": 0.00012506525322587207,
+      "loss": 1.3079,
+      "step": 88
+    },
+    {
+      "epoch": 1.7959183673469388,
+      "grad_norm": 0.19290736317634583,
+      "learning_rate": 0.00012342664606720822,
+      "loss": 1.3174,
+      "step": 89
+    },
+    {
+      "epoch": 1.816326530612245,
+      "grad_norm": 0.20912542939186096,
+      "learning_rate": 0.00012178135587488515,
+      "loss": 1.2915,
+      "step": 90
+    },
+    {
+      "epoch": 1.836734693877551,
+      "grad_norm": 0.20760588347911835,
+      "learning_rate": 0.00012012985200886602,
+      "loss": 1.3028,
+      "step": 91
+    },
+    {
+      "epoch": 1.836734693877551,
+      "eval_loss": 1.2795333862304688,
+      "eval_runtime": 270.6525,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 91
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 0.1996900886297226,
+      "learning_rate": 0.00011847260560171896,
+      "loss": 1.3119,
+      "step": 92
+    },
+    {
+      "epoch": 1.8775510204081631,
+      "grad_norm": 0.23766876757144928,
+      "learning_rate": 0.00011681008942421483,
+      "loss": 1.2978,
+      "step": 93
+    },
+    {
+      "epoch": 1.8979591836734695,
+      "grad_norm": 0.19782397150993347,
+      "learning_rate": 0.00011514277775045768,
+      "loss": 1.2955,
+      "step": 94
+    },
+    {
+      "epoch": 1.9183673469387754,
+      "grad_norm": 0.22519494593143463,
+      "learning_rate": 0.00011347114622258612,
+      "loss": 1.2957,
+      "step": 95
+    },
+    {
+      "epoch": 1.9387755102040818,
+      "grad_norm": 0.2590245306491852,
+      "learning_rate": 0.00011179567171508463,
+      "loss": 1.2809,
+      "step": 96
+    },
+    {
+      "epoch": 1.9591836734693877,
+      "grad_norm": 0.2235420197248459,
+      "learning_rate": 0.00011011683219874323,
+      "loss": 1.2784,
+      "step": 97
+    },
+    {
+      "epoch": 1.9795918367346939,
+      "grad_norm": 0.285740464925766,
+      "learning_rate": 0.00010843510660430447,
+      "loss": 1.309,
+      "step": 98
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20554350316524506,
+      "learning_rate": 0.00010675097468583652,
+      "loss": 1.273,
+      "step": 99
+    },
+    {
+      "epoch": 2.020408163265306,
+      "grad_norm": 0.24468418955802917,
+      "learning_rate": 0.00010506491688387127,
+      "loss": 1.2833,
+      "step": 100
+    },
+    {
+      "epoch": 2.020408163265306,
+      "grad_norm": 0.21553528308868408,
+      "learning_rate": 0.00010337741418834684,
+      "loss": 1.2669,
+      "step": 101
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 0.22015659511089325,
+      "learning_rate": 0.0001016889480013931,
+      "loss": 1.2795,
+      "step": 102
+    },
+    {
+      "epoch": 2.061224489795918,
+      "grad_norm": 0.2028799206018448,
+      "learning_rate": 0.0001,
+      "loss": 1.2584,
+      "step": 103
+    },
+    {
+      "epoch": 2.0816326530612246,
+      "grad_norm": 0.23474323749542236,
+      "learning_rate": 9.83110519986069e-05,
+      "loss": 1.2761,
+      "step": 104
+    },
+    {
+      "epoch": 2.0816326530612246,
+      "eval_loss": 1.2696796655654907,
+      "eval_runtime": 270.6586,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 104
+    },
+    {
+      "epoch": 2.1020408163265305,
+      "grad_norm": 0.21070216596126556,
+      "learning_rate": 9.662258581165319e-05,
+      "loss": 1.2808,
+      "step": 105
+    },
+    {
+      "epoch": 2.122448979591837,
+      "grad_norm": 0.21867221593856812,
+      "learning_rate": 9.493508311612874e-05,
+      "loss": 1.2873,
+      "step": 106
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.21630822122097015,
+      "learning_rate": 9.324902531416349e-05,
+      "loss": 1.2527,
+      "step": 107
+    },
+    {
+      "epoch": 2.163265306122449,
+      "grad_norm": 0.2134082019329071,
+      "learning_rate": 9.156489339569554e-05,
+      "loss": 1.2755,
+      "step": 108
+    },
+    {
+      "epoch": 2.183673469387755,
+      "grad_norm": 0.22310714423656464,
+      "learning_rate": 8.98831678012568e-05,
+      "loss": 1.2512,
+      "step": 109
+    },
+    {
+      "epoch": 2.204081632653061,
+      "grad_norm": 0.2365124374628067,
+      "learning_rate": 8.820432828491542e-05,
+      "loss": 1.2725,
+      "step": 110
+    },
+    {
+      "epoch": 2.2244897959183674,
+      "grad_norm": 0.2086496651172638,
+      "learning_rate": 8.652885377741393e-05,
+      "loss": 1.2488,
+      "step": 111
+    },
+    {
+      "epoch": 2.2448979591836733,
+      "grad_norm": 0.20848101377487183,
+      "learning_rate": 8.485722224954237e-05,
+      "loss": 1.2793,
+      "step": 112
+    },
+    {
+      "epoch": 2.2653061224489797,
+      "grad_norm": 0.20784686505794525,
+      "learning_rate": 8.31899105757852e-05,
+      "loss": 1.2564,
+      "step": 113
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.21896174550056458,
+      "learning_rate": 8.15273943982811e-05,
+      "loss": 1.2515,
+      "step": 114
+    },
+    {
+      "epoch": 2.306122448979592,
+      "grad_norm": 0.21367855370044708,
+      "learning_rate": 7.987014799113397e-05,
+      "loss": 1.248,
+      "step": 115
+    },
+    {
+      "epoch": 2.326530612244898,
+      "grad_norm": 0.20891636610031128,
+      "learning_rate": 7.821864412511485e-05,
+      "loss": 1.2753,
+      "step": 116
+    },
+    {
+      "epoch": 2.3469387755102042,
+      "grad_norm": 0.2092975378036499,
+      "learning_rate": 7.65733539327918e-05,
+      "loss": 1.2509,
+      "step": 117
+    },
+    {
+      "epoch": 2.3469387755102042,
+      "eval_loss": 1.258699655532837,
+      "eval_runtime": 270.5384,
+      "eval_samples_per_second": 6.265,
+      "eval_steps_per_second": 3.134,
+      "step": 117
+    },
+    {
+      "epoch": 2.36734693877551,
+      "grad_norm": 0.1905972808599472,
+      "learning_rate": 7.493474677412794e-05,
+      "loss": 1.2516,
+      "step": 118
+    },
+    {
+      "epoch": 2.387755102040816,
+      "grad_norm": 0.19716158509254456,
+      "learning_rate": 7.330329010258483e-05,
+      "loss": 1.2665,
+      "step": 119
+    },
+    {
+      "epoch": 2.4081632653061225,
+      "grad_norm": 0.1953389048576355,
+      "learning_rate": 7.16794493317696e-05,
+      "loss": 1.2661,
+      "step": 120
+    },
+    {
+      "epoch": 2.4285714285714284,
+      "grad_norm": 0.1990067958831787,
+      "learning_rate": 7.006368770266421e-05,
+      "loss": 1.2619,
+      "step": 121
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 0.1954919546842575,
+      "learning_rate": 6.845646615147445e-05,
+      "loss": 1.2736,
+      "step": 122
+    },
+    {
+      "epoch": 2.4693877551020407,
+      "grad_norm": 0.18382853269577026,
+      "learning_rate": 6.685824317813643e-05,
+      "loss": 1.2732,
+      "step": 123
+    },
+    {
+      "epoch": 2.489795918367347,
+      "grad_norm": 0.18729491531848907,
+      "learning_rate": 6.526947471551798e-05,
+      "loss": 1.2509,
+      "step": 124
+    },
+    {
+      "epoch": 2.510204081632653,
+      "grad_norm": 0.2034740000963211,
+      "learning_rate": 6.369061399935255e-05,
+      "loss": 1.2829,
+      "step": 125
+    },
+    {
+      "epoch": 2.5306122448979593,
+      "grad_norm": 0.1952620893716812,
+      "learning_rate": 6.21221114389424e-05,
+      "loss": 1.2689,
+      "step": 126
+    },
+    {
+      "epoch": 2.5510204081632653,
+      "grad_norm": 0.1986168622970581,
+      "learning_rate": 6.0564414488668165e-05,
+      "loss": 1.2644,
+      "step": 127
+    },
+    {
+      "epoch": 2.571428571428571,
+      "grad_norm": 0.19526751339435577,
+      "learning_rate": 5.901796752034128e-05,
+      "loss": 1.265,
+      "step": 128
+    },
+    {
+      "epoch": 2.5918367346938775,
+      "grad_norm": 0.195367693901062,
+      "learning_rate": 5.748321169643596e-05,
+      "loss": 1.2782,
+      "step": 129
+    },
+    {
+      "epoch": 2.612244897959184,
+      "grad_norm": 0.18351928889751434,
+      "learning_rate": 5.596058484423656e-05,
+      "loss": 1.2884,
+      "step": 130
+    },
+    {
+      "epoch": 2.612244897959184,
+      "eval_loss": 1.2471545934677124,
+      "eval_runtime": 270.4953,
+      "eval_samples_per_second": 6.266,
+      "eval_steps_per_second": 3.135,
+      "step": 130
+    },
+    {
+      "epoch": 2.63265306122449,
+      "grad_norm": 0.2015760987997055,
+      "learning_rate": 5.44505213309366e-05,
+      "loss": 1.2536,
+      "step": 131
+    },
+    {
+      "epoch": 2.6530612244897958,
+      "grad_norm": 0.1734190732240677,
+      "learning_rate": 5.2953451939724454e-05,
+      "loss": 1.2628,
+      "step": 132
+    },
+    {
+      "epoch": 2.673469387755102,
+      "grad_norm": 0.214066281914711,
+      "learning_rate": 5.146980374689192e-05,
+      "loss": 1.2543,
+      "step": 133
+    },
+    {
+      "epoch": 2.693877551020408,
+      "grad_norm": 0.17507924139499664,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.2665,
+      "step": 134
+    },
+    {
+      "epoch": 2.7142857142857144,
+      "grad_norm": 0.1778109222650528,
+      "learning_rate": 4.854445999713715e-05,
+      "loss": 1.2789,
+      "step": 135
+    },
+    {
+      "epoch": 2.7346938775510203,
+      "grad_norm": 0.1856827288866043,
+      "learning_rate": 4.710359896730379e-05,
+      "loss": 1.2481,
+      "step": 136
+    },
+    {
+      "epoch": 2.7551020408163263,
+      "grad_norm": 0.17856694757938385,
+      "learning_rate": 4.567782795195816e-05,
+      "loss": 1.2732,
+      "step": 137
+    },
+    {
+      "epoch": 2.7755102040816326,
+      "grad_norm": 0.21598489582538605,
+      "learning_rate": 4.426755368775637e-05,
+      "loss": 1.2525,
+      "step": 138
+    },
+    {
+      "epoch": 2.795918367346939,
+      "grad_norm": 0.17308436334133148,
+      "learning_rate": 4.287317849052075e-05,
+      "loss": 1.2665,
+      "step": 139
+    },
+    {
+      "epoch": 2.816326530612245,
+      "grad_norm": 0.19207212328910828,
+      "learning_rate": 4.149510014046922e-05,
+      "loss": 1.2681,
+      "step": 140
+    },
+    {
+      "epoch": 2.836734693877551,
+      "grad_norm": 0.19626958668231964,
+      "learning_rate": 4.013371176873849e-05,
+      "loss": 1.2727,
+      "step": 141
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.1986483484506607,
+      "learning_rate": 3.878940174523371e-05,
+      "loss": 1.2414,
+      "step": 142
+    },
+    {
+      "epoch": 2.877551020408163,
+      "grad_norm": 0.19369089603424072,
+      "learning_rate": 3.746255356783632e-05,
+      "loss": 1.254,
+      "step": 143
+    },
+    {
+      "epoch": 2.877551020408163,
+      "eval_loss": 1.2410293817520142,
+      "eval_runtime": 270.6762,
+      "eval_samples_per_second": 6.262,
+      "eval_steps_per_second": 3.133,
+      "step": 143
+    },
+    {
+      "epoch": 2.8979591836734695,
+      "grad_norm": 0.20910531282424927,
+      "learning_rate": 3.615354575300166e-05,
+      "loss": 1.2541,
+      "step": 144
+    },
+    {
+      "epoch": 2.9183673469387754,
+      "grad_norm": 0.19536806643009186,
+      "learning_rate": 3.4862751727777797e-05,
+      "loss": 1.2517,
+      "step": 145
+    },
+    {
+      "epoch": 2.938775510204082,
+      "grad_norm": 0.18630966544151306,
+      "learning_rate": 3.3590539723276083e-05,
+      "loss": 1.2473,
+      "step": 146
+    },
+    {
+      "epoch": 2.9591836734693877,
+      "grad_norm": 0.1874723732471466,
+      "learning_rate": 3.233727266962425e-05,
+      "loss": 1.244,
+      "step": 147
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 196,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 49,
+  "total_flos": 3.0628052408991744e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-147/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816

checkpoint-196/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for a causal language model
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-196/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-196/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7149dfd1479c35b75fc75c4e9be3785070da91bd7c29d040e9a259ea5111014
+size 100966336

checkpoint-196/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96f2429392a17aa7909b16091d5a0b62592f80090a1a9943b203b1e1c29e66f8
+size 50916644

checkpoint-196/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a160c2864b63ef158843056f3ba263b2da60c6bef707459f056731cde2e27043
+size 14244

checkpoint-196/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e22ca0a50bab80d00c8b8910bffb983a348f8762b7cf025e6f8e64a05a938289
+size 1064

checkpoint-196/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-196/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-196/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-196/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1521 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.938775510204082,
+  "eval_steps": 13,
+  "global_step": 196,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02040816326530612,
+      "grad_norm": 0.7881951332092285,
+      "learning_rate": 2e-05,
+      "loss": 2.7509,
+      "step": 1
+    },
+    {
+      "epoch": 0.02040816326530612,
+      "eval_loss": 2.6902382373809814,
+      "eval_runtime": 269.5606,
+      "eval_samples_per_second": 6.288,
+      "eval_steps_per_second": 3.146,
+      "step": 1
+    },
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 0.789082407951355,
+      "learning_rate": 4e-05,
+      "loss": 2.7449,
+      "step": 2
+    },
+    {
+      "epoch": 0.061224489795918366,
+      "grad_norm": 0.7354114055633545,
+      "learning_rate": 6e-05,
+      "loss": 2.7164,
+      "step": 3
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 0.7292255759239197,
+      "learning_rate": 8e-05,
+      "loss": 2.7174,
+      "step": 4
+    },
+    {
+      "epoch": 0.10204081632653061,
+      "grad_norm": 0.6898028254508972,
+      "learning_rate": 0.0001,
+      "loss": 2.6891,
+      "step": 5
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 0.6861400604248047,
+      "learning_rate": 0.00012,
+      "loss": 2.6545,
+      "step": 6
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.7510350346565247,
+      "learning_rate": 0.00014,
+      "loss": 2.5656,
+      "step": 7
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.8011165261268616,
+      "learning_rate": 0.00016,
+      "loss": 2.4519,
+      "step": 8
+    },
+    {
+      "epoch": 0.1836734693877551,
+      "grad_norm": 0.8624005317687988,
+      "learning_rate": 0.00018,
+      "loss": 2.3178,
+      "step": 9
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 0.8004987835884094,
+      "learning_rate": 0.0002,
+      "loss": 2.1783,
+      "step": 10
+    },
+    {
+      "epoch": 0.22448979591836735,
+      "grad_norm": 0.6362400054931641,
+      "learning_rate": 0.000199985736255971,
+      "loss": 2.0252,
+      "step": 11
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 0.7930936217308044,
+      "learning_rate": 0.0001999429490929718,
+      "loss": 1.8839,
+      "step": 12
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "grad_norm": 0.5149843096733093,
+      "learning_rate": 0.00019987165071710527,
+      "loss": 1.8064,
+      "step": 13
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "eval_loss": 1.6734941005706787,
+      "eval_runtime": 271.2615,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 13
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.42121434211730957,
+      "learning_rate": 0.00019977186146800707,
+      "loss": 1.7922,
+      "step": 14
+    },
+    {
+      "epoch": 0.30612244897959184,
+      "grad_norm": 0.3523242771625519,
+      "learning_rate": 0.0001996436098130433,
+      "loss": 1.7711,
+      "step": 15
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.3384595215320587,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 1.7152,
+      "step": 16
+    },
+    {
+      "epoch": 0.3469387755102041,
+      "grad_norm": 0.34942421317100525,
+      "learning_rate": 0.00019930187374259337,
+      "loss": 1.7112,
+      "step": 17
+    },
+    {
+      "epoch": 0.3673469387755102,
+      "grad_norm": 0.31712639331817627,
+      "learning_rate": 0.00019908848681582391,
+      "loss": 1.7059,
+      "step": 18
+    },
+    {
+      "epoch": 0.3877551020408163,
+      "grad_norm": 0.2875436842441559,
+      "learning_rate": 0.00019884683243281116,
+      "loss": 1.6468,
+      "step": 19
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.24433130025863647,
+      "learning_rate": 0.00019857697953148037,
+      "loss": 1.6408,
+      "step": 20
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.21414674818515778,
+      "learning_rate": 0.00019827900509408581,
+      "loss": 1.616,
+      "step": 21
+    },
+    {
+      "epoch": 0.4489795918367347,
+      "grad_norm": 0.21537622809410095,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 1.609,
+      "step": 22
+    },
+    {
+      "epoch": 0.46938775510204084,
+      "grad_norm": 0.2432074397802353,
+      "learning_rate": 0.00019759903962771156,
+      "loss": 1.6066,
+      "step": 23
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 0.2359839379787445,
+      "learning_rate": 0.00019721724257579907,
+      "loss": 1.5851,
+      "step": 24
+    },
+    {
+      "epoch": 0.5102040816326531,
+      "grad_norm": 0.22065888345241547,
+      "learning_rate": 0.00019680771188662044,
+      "loss": 1.5739,
+      "step": 25
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "grad_norm": 0.20339132845401764,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 1.5513,
+      "step": 26
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "eval_loss": 1.4832030534744263,
+      "eval_runtime": 271.2449,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 26
+    },
+    {
+      "epoch": 0.5510204081632653,
+      "grad_norm": 0.18875224888324738,
+      "learning_rate": 0.00019590592479012023,
+      "loss": 1.5378,
+      "step": 27
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.18564417958259583,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 1.5212,
+      "step": 28
+    },
+    {
+      "epoch": 0.5918367346938775,
+      "grad_norm": 0.16226942837238312,
+      "learning_rate": 0.00019489470729364692,
+      "loss": 1.5391,
+      "step": 29
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 0.15650039911270142,
+      "learning_rate": 0.00019434841787099803,
+      "loss": 1.511,
+      "step": 30
+    },
+    {
+      "epoch": 0.6326530612244898,
+      "grad_norm": 0.15976540744304657,
+      "learning_rate": 0.00019377521321470805,
+      "loss": 1.5119,
+      "step": 31
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.16409288346767426,
+      "learning_rate": 0.00019317525684566685,
+      "loss": 1.4909,
+      "step": 32
+    },
+    {
+      "epoch": 0.673469387755102,
+      "grad_norm": 0.15468019247055054,
+      "learning_rate": 0.00019254871991635598,
+      "loss": 1.4951,
+      "step": 33
+    },
+    {
+      "epoch": 0.6938775510204082,
+      "grad_norm": 0.1462036371231079,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 1.4643,
+      "step": 34
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.1541963368654251,
+      "learning_rate": 0.00019121662684969335,
+      "loss": 1.5159,
+      "step": 35
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 0.14798064529895782,
+      "learning_rate": 0.00019051145072503215,
+      "loss": 1.4741,
+      "step": 36
+    },
+    {
+      "epoch": 0.7551020408163265,
+      "grad_norm": 0.13914817571640015,
+      "learning_rate": 0.00018978045395707418,
+      "loss": 1.4788,
+      "step": 37
+    },
+    {
+      "epoch": 0.7755102040816326,
+      "grad_norm": 0.15608824789524078,
+      "learning_rate": 0.00018902384508083517,
+      "loss": 1.4687,
+      "step": 38
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "grad_norm": 0.14460116624832153,
+      "learning_rate": 0.00018824183993782192,
+      "loss": 1.482,
+      "step": 39
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "eval_loss": 1.411073088645935,
+      "eval_runtime": 271.292,
+      "eval_samples_per_second": 6.248,
+      "eval_steps_per_second": 3.126,
+      "step": 39
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.15740551054477692,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 1.4486,
+      "step": 40
+    },
+    {
+      "epoch": 0.8367346938775511,
+      "grad_norm": 0.14149661362171173,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.4353,
+      "step": 41
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.14034292101860046,
+      "learning_rate": 0.0001857457136130651,
+      "loss": 1.4523,
+      "step": 42
+    },
+    {
+      "epoch": 0.8775510204081632,
+      "grad_norm": 0.1487722396850586,
+      "learning_rate": 0.00018486442574947511,
+      "loss": 1.4095,
+      "step": 43
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 0.17400234937667847,
+      "learning_rate": 0.00018395892819696389,
+      "loss": 1.4414,
+      "step": 44
+    },
+    {
+      "epoch": 0.9183673469387755,
+      "grad_norm": 0.1741325408220291,
+      "learning_rate": 0.00018302947927123766,
+      "loss": 1.4379,
+      "step": 45
+    },
+    {
+      "epoch": 0.9387755102040817,
+      "grad_norm": 0.15319454669952393,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 1.405,
+      "step": 46
+    },
+    {
+      "epoch": 0.9591836734693877,
+      "grad_norm": 0.15876264870166779,
+      "learning_rate": 0.00018109979465095013,
+      "loss": 1.4122,
+      "step": 47
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.17120805382728577,
+      "learning_rate": 0.00018010010944693848,
+      "loss": 1.4132,
+      "step": 48
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1436116099357605,
+      "learning_rate": 0.00017907757369376985,
+      "loss": 1.416,
+      "step": 49
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.1707429438829422,
+      "learning_rate": 0.0001780324790952092,
+      "loss": 1.3913,
+      "step": 50
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.17117524147033691,
+      "learning_rate": 0.00017696512379049325,
+      "loss": 1.3963,
+      "step": 51
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "grad_norm": 0.13410089910030365,
+      "learning_rate": 0.0001758758122692791,
+      "loss": 1.392,
+      "step": 52
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "eval_loss": 1.3676769733428955,
+      "eval_runtime": 270.8566,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 52
+    },
+    {
+      "epoch": 1.0612244897959184,
+      "grad_norm": 0.18877607583999634,
+      "learning_rate": 0.00017476485528478093,
+      "loss": 1.3854,
+      "step": 53
+    },
+    {
+      "epoch": 1.0816326530612246,
+      "grad_norm": 0.1752927452325821,
+      "learning_rate": 0.00017363256976511972,
+      "loss": 1.3759,
+      "step": 54
+    },
+    {
+      "epoch": 1.1020408163265305,
+      "grad_norm": 0.17180170118808746,
+      "learning_rate": 0.000172479278722912,
+      "loss": 1.3614,
+      "step": 55
+    },
+    {
+      "epoch": 1.1224489795918366,
+      "grad_norm": 0.1640290915966034,
+      "learning_rate": 0.00017130531116312203,
+      "loss": 1.3853,
+      "step": 56
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.2047068476676941,
+      "learning_rate": 0.0001701110019892053,
+      "loss": 1.3699,
+      "step": 57
+    },
+    {
+      "epoch": 1.163265306122449,
+      "grad_norm": 0.1835869997739792,
+      "learning_rate": 0.00016889669190756868,
+      "loss": 1.3403,
+      "step": 58
+    },
+    {
+      "epoch": 1.183673469387755,
+      "grad_norm": 0.16733241081237793,
+      "learning_rate": 0.00016766272733037576,
+      "loss": 1.3609,
+      "step": 59
+    },
+    {
+      "epoch": 1.2040816326530612,
+      "grad_norm": 0.178726926445961,
+      "learning_rate": 0.00016640946027672392,
+      "loss": 1.3651,
+      "step": 60
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 0.16719630360603333,
+      "learning_rate": 0.00016513724827222227,
+      "loss": 1.3676,
+      "step": 61
+    },
+    {
+      "epoch": 1.2448979591836735,
+      "grad_norm": 0.15999363362789154,
+      "learning_rate": 0.00016384645424699835,
+      "loss": 1.3651,
+      "step": 62
+    },
+    {
+      "epoch": 1.2653061224489797,
+      "grad_norm": 0.1705988198518753,
+      "learning_rate": 0.00016253744643216368,
+      "loss": 1.3757,
+      "step": 63
+    },
+    {
+      "epoch": 1.2857142857142856,
+      "grad_norm": 0.14996370673179626,
+      "learning_rate": 0.0001612105982547663,
+      "loss": 1.3474,
+      "step": 64
+    },
+    {
+      "epoch": 1.306122448979592,
+      "grad_norm": 0.19127260148525238,
+      "learning_rate": 0.0001598662882312615,
+      "loss": 1.3414,
+      "step": 65
+    },
+    {
+      "epoch": 1.306122448979592,
+      "eval_loss": 1.331880807876587,
+      "eval_runtime": 270.8424,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 65
+    },
+    {
+      "epoch": 1.3265306122448979,
+      "grad_norm": 0.16125527024269104,
+      "learning_rate": 0.00015850489985953076,
+      "loss": 1.3509,
+      "step": 66
+    },
+    {
+      "epoch": 1.346938775510204,
+      "grad_norm": 0.1979473978281021,
+      "learning_rate": 0.00015712682150947923,
+      "loss": 1.3579,
+      "step": 67
+    },
+    {
+      "epoch": 1.3673469387755102,
+      "grad_norm": 0.18317992985248566,
+      "learning_rate": 0.00015573244631224365,
+      "loss": 1.3341,
+      "step": 68
+    },
+    {
+      "epoch": 1.3877551020408163,
+      "grad_norm": 0.1646898239850998,
+      "learning_rate": 0.0001543221720480419,
+      "loss": 1.3361,
+      "step": 69
+    },
+    {
+      "epoch": 1.4081632653061225,
+      "grad_norm": 0.1760271042585373,
+      "learning_rate": 0.00015289640103269625,
+      "loss": 1.358,
+      "step": 70
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.165283203125,
+      "learning_rate": 0.0001514555400028629,
+      "loss": 1.3072,
+      "step": 71
+    },
+    {
+      "epoch": 1.4489795918367347,
+      "grad_norm": 0.1507076472043991,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3133,
+      "step": 72
+    },
+    {
+      "epoch": 1.469387755102041,
+      "grad_norm": 0.16913647949695587,
+      "learning_rate": 0.00014853019625310813,
+      "loss": 1.3232,
+      "step": 73
+    },
+    {
+      "epoch": 1.489795918367347,
+      "grad_norm": 0.18266479671001434,
+      "learning_rate": 0.0001470465480602756,
+      "loss": 1.3512,
+      "step": 74
+    },
+    {
+      "epoch": 1.510204081632653,
+      "grad_norm": 0.19301828742027283,
+      "learning_rate": 0.0001455494786690634,
+      "loss": 1.3241,
+      "step": 75
+    },
+    {
+      "epoch": 1.5306122448979593,
+      "grad_norm": 0.16109652817249298,
+      "learning_rate": 0.00014403941515576344,
+      "loss": 1.3256,
+      "step": 76
+    },
+    {
+      "epoch": 1.5510204081632653,
+      "grad_norm": 0.17053867876529694,
+      "learning_rate": 0.00014251678830356408,
+      "loss": 1.3162,
+      "step": 77
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "grad_norm": 0.17348544299602509,
+      "learning_rate": 0.00014098203247965875,
+      "loss": 1.3213,
+      "step": 78
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "eval_loss": 1.3028697967529297,
+      "eval_runtime": 270.8095,
+      "eval_samples_per_second": 6.259,
+      "eval_steps_per_second": 3.131,
+      "step": 78
+    },
+    {
+      "epoch": 1.5918367346938775,
+      "grad_norm": 0.1703907549381256,
+      "learning_rate": 0.00013943558551133186,
+      "loss": 1.3073,
+      "step": 79
+    },
+    {
+      "epoch": 1.6122448979591837,
+      "grad_norm": 0.17313100397586823,
+      "learning_rate": 0.0001378778885610576,
+      "loss": 1.3232,
+      "step": 80
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.17237025499343872,
+      "learning_rate": 0.00013630938600064747,
+      "loss": 1.3406,
+      "step": 81
+    },
+    {
+      "epoch": 1.6530612244897958,
+      "grad_norm": 0.19658459722995758,
+      "learning_rate": 0.00013473052528448201,
+      "loss": 1.3114,
+      "step": 82
+    },
+    {
+      "epoch": 1.6734693877551021,
+      "grad_norm": 0.20599938929080963,
+      "learning_rate": 0.0001331417568218636,
+      "loss": 1.3288,
+      "step": 83
+    },
+    {
+      "epoch": 1.693877551020408,
+      "grad_norm": 0.17759399116039276,
+      "learning_rate": 0.00013154353384852558,
+      "loss": 1.2995,
+      "step": 84
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.18712250888347626,
+      "learning_rate": 0.00012993631229733582,
+      "loss": 1.2895,
+      "step": 85
+    },
+    {
+      "epoch": 1.7346938775510203,
+      "grad_norm": 0.1991330236196518,
+      "learning_rate": 0.00012832055066823038,
+      "loss": 1.2886,
+      "step": 86
+    },
+    {
+      "epoch": 1.7551020408163265,
+      "grad_norm": 0.22125203907489777,
+      "learning_rate": 0.00012669670989741517,
+      "loss": 1.3233,
+      "step": 87
+    },
+    {
+      "epoch": 1.7755102040816326,
+      "grad_norm": 0.2052813619375229,
+      "learning_rate": 0.00012506525322587207,
+      "loss": 1.3079,
+      "step": 88
+    },
+    {
+      "epoch": 1.7959183673469388,
+      "grad_norm": 0.19290736317634583,
+      "learning_rate": 0.00012342664606720822,
+      "loss": 1.3174,
+      "step": 89
+    },
+    {
+      "epoch": 1.816326530612245,
+      "grad_norm": 0.20912542939186096,
+      "learning_rate": 0.00012178135587488515,
+      "loss": 1.2915,
+      "step": 90
+    },
+    {
+      "epoch": 1.836734693877551,
+      "grad_norm": 0.20760588347911835,
+      "learning_rate": 0.00012012985200886602,
+      "loss": 1.3028,
+      "step": 91
+    },
+    {
+      "epoch": 1.836734693877551,
+      "eval_loss": 1.2795333862304688,
+      "eval_runtime": 270.6525,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 91
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 0.1996900886297226,
+      "learning_rate": 0.00011847260560171896,
+      "loss": 1.3119,
+      "step": 92
+    },
+    {
+      "epoch": 1.8775510204081631,
+      "grad_norm": 0.23766876757144928,
+      "learning_rate": 0.00011681008942421483,
+      "loss": 1.2978,
+      "step": 93
+    },
+    {
+      "epoch": 1.8979591836734695,
+      "grad_norm": 0.19782397150993347,
+      "learning_rate": 0.00011514277775045768,
+      "loss": 1.2955,
+      "step": 94
+    },
+    {
+      "epoch": 1.9183673469387754,
+      "grad_norm": 0.22519494593143463,
+      "learning_rate": 0.00011347114622258612,
+      "loss": 1.2957,
+      "step": 95
+    },
+    {
+      "epoch": 1.9387755102040818,
+      "grad_norm": 0.2590245306491852,
+      "learning_rate": 0.00011179567171508463,
+      "loss": 1.2809,
+      "step": 96
+    },
+    {
+      "epoch": 1.9591836734693877,
+      "grad_norm": 0.2235420197248459,
+      "learning_rate": 0.00011011683219874323,
+      "loss": 1.2784,
+      "step": 97
+    },
+    {
+      "epoch": 1.9795918367346939,
+      "grad_norm": 0.285740464925766,
+      "learning_rate": 0.00010843510660430447,
+      "loss": 1.309,
+      "step": 98
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.20554350316524506,
+      "learning_rate": 0.00010675097468583652,
+      "loss": 1.273,
+      "step": 99
+    },
+    {
+      "epoch": 2.020408163265306,
+      "grad_norm": 0.24468418955802917,
+      "learning_rate": 0.00010506491688387127,
+      "loss": 1.2833,
+      "step": 100
+    },
+    {
+      "epoch": 2.020408163265306,
+      "grad_norm": 0.21553528308868408,
+      "learning_rate": 0.00010337741418834684,
+      "loss": 1.2669,
+      "step": 101
+    },
+    {
+      "epoch": 2.0408163265306123,
+      "grad_norm": 0.22015659511089325,
+      "learning_rate": 0.0001016889480013931,
+      "loss": 1.2795,
+      "step": 102
+    },
+    {
+      "epoch": 2.061224489795918,
+      "grad_norm": 0.2028799206018448,
+      "learning_rate": 0.0001,
+      "loss": 1.2584,
+      "step": 103
+    },
+    {
+      "epoch": 2.0816326530612246,
+      "grad_norm": 0.23474323749542236,
+      "learning_rate": 9.83110519986069e-05,
+      "loss": 1.2761,
+      "step": 104
+    },
+    {
+      "epoch": 2.0816326530612246,
+      "eval_loss": 1.2696796655654907,
+      "eval_runtime": 270.6586,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 104
+    },
+    {
+      "epoch": 2.1020408163265305,
+      "grad_norm": 0.21070216596126556,
+      "learning_rate": 9.662258581165319e-05,
+      "loss": 1.2808,
+      "step": 105
+    },
+    {
+      "epoch": 2.122448979591837,
+      "grad_norm": 0.21867221593856812,
+      "learning_rate": 9.493508311612874e-05,
+      "loss": 1.2873,
+      "step": 106
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.21630822122097015,
+      "learning_rate": 9.324902531416349e-05,
+      "loss": 1.2527,
+      "step": 107
+    },
+    {
+      "epoch": 2.163265306122449,
+      "grad_norm": 0.2134082019329071,
+      "learning_rate": 9.156489339569554e-05,
+      "loss": 1.2755,
+      "step": 108
+    },
+    {
+      "epoch": 2.183673469387755,
+      "grad_norm": 0.22310714423656464,
+      "learning_rate": 8.98831678012568e-05,
+      "loss": 1.2512,
+      "step": 109
+    },
+    {
+      "epoch": 2.204081632653061,
+      "grad_norm": 0.2365124374628067,
+      "learning_rate": 8.820432828491542e-05,
+      "loss": 1.2725,
+      "step": 110
+    },
+    {
+      "epoch": 2.2244897959183674,
+      "grad_norm": 0.2086496651172638,
+      "learning_rate": 8.652885377741393e-05,
+      "loss": 1.2488,
+      "step": 111
+    },
+    {
+      "epoch": 2.2448979591836733,
+      "grad_norm": 0.20848101377487183,
+      "learning_rate": 8.485722224954237e-05,
+      "loss": 1.2793,
+      "step": 112
+    },
+    {
+      "epoch": 2.2653061224489797,
+      "grad_norm": 0.20784686505794525,
+      "learning_rate": 8.31899105757852e-05,
+      "loss": 1.2564,
+      "step": 113
+    },
+    {
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.21896174550056458,
+      "learning_rate": 8.15273943982811e-05,
+      "loss": 1.2515,
+      "step": 114
+    },
+    {
+      "epoch": 2.306122448979592,
+      "grad_norm": 0.21367855370044708,
+      "learning_rate": 7.987014799113397e-05,
+      "loss": 1.248,
+      "step": 115
+    },
+    {
+      "epoch": 2.326530612244898,
+      "grad_norm": 0.20891636610031128,
+      "learning_rate": 7.821864412511485e-05,
+      "loss": 1.2753,
+      "step": 116
+    },
+    {
+      "epoch": 2.3469387755102042,
+      "grad_norm": 0.2092975378036499,
+      "learning_rate": 7.65733539327918e-05,
+      "loss": 1.2509,
+      "step": 117
+    },
+    {
+      "epoch": 2.3469387755102042,
+      "eval_loss": 1.258699655532837,
+      "eval_runtime": 270.5384,
+      "eval_samples_per_second": 6.265,
+      "eval_steps_per_second": 3.134,
+      "step": 117
+    },
+    {
+      "epoch": 2.36734693877551,
+      "grad_norm": 0.1905972808599472,
+      "learning_rate": 7.493474677412794e-05,
+      "loss": 1.2516,
+      "step": 118
+    },
+    {
+      "epoch": 2.387755102040816,
+      "grad_norm": 0.19716158509254456,
+      "learning_rate": 7.330329010258483e-05,
+      "loss": 1.2665,
+      "step": 119
+    },
+    {
+      "epoch": 2.4081632653061225,
+      "grad_norm": 0.1953389048576355,
+      "learning_rate": 7.16794493317696e-05,
+      "loss": 1.2661,
+      "step": 120
+    },
+    {
+      "epoch": 2.4285714285714284,
+      "grad_norm": 0.1990067958831787,
+      "learning_rate": 7.006368770266421e-05,
+      "loss": 1.2619,
+      "step": 121
+    },
+    {
+      "epoch": 2.4489795918367347,
+      "grad_norm": 0.1954919546842575,
+      "learning_rate": 6.845646615147445e-05,
+      "loss": 1.2736,
+      "step": 122
+    },
+    {
+      "epoch": 2.4693877551020407,
+      "grad_norm": 0.18382853269577026,
+      "learning_rate": 6.685824317813643e-05,
+      "loss": 1.2732,
+      "step": 123
+    },
+    {
+      "epoch": 2.489795918367347,
+      "grad_norm": 0.18729491531848907,
+      "learning_rate": 6.526947471551798e-05,
+      "loss": 1.2509,
+      "step": 124
+    },
+    {
+      "epoch": 2.510204081632653,
+      "grad_norm": 0.2034740000963211,
+      "learning_rate": 6.369061399935255e-05,
+      "loss": 1.2829,
+      "step": 125
+    },
+    {
+      "epoch": 2.5306122448979593,
+      "grad_norm": 0.1952620893716812,
+      "learning_rate": 6.21221114389424e-05,
+      "loss": 1.2689,
+      "step": 126
+    },
+    {
+      "epoch": 2.5510204081632653,
+      "grad_norm": 0.1986168622970581,
+      "learning_rate": 6.0564414488668165e-05,
+      "loss": 1.2644,
+      "step": 127
+    },
+    {
+      "epoch": 2.571428571428571,
+      "grad_norm": 0.19526751339435577,
+      "learning_rate": 5.901796752034128e-05,
+      "loss": 1.265,
+      "step": 128
+    },
+    {
+      "epoch": 2.5918367346938775,
+      "grad_norm": 0.195367693901062,
+      "learning_rate": 5.748321169643596e-05,
+      "loss": 1.2782,
+      "step": 129
+    },
+    {
+      "epoch": 2.612244897959184,
+      "grad_norm": 0.18351928889751434,
+      "learning_rate": 5.596058484423656e-05,
+      "loss": 1.2884,
+      "step": 130
+    },
+    {
+      "epoch": 2.612244897959184,
+      "eval_loss": 1.2471545934677124,
+      "eval_runtime": 270.4953,
+      "eval_samples_per_second": 6.266,
+      "eval_steps_per_second": 3.135,
+      "step": 130
+    },
+    {
+      "epoch": 2.63265306122449,
+      "grad_norm": 0.2015760987997055,
+      "learning_rate": 5.44505213309366e-05,
+      "loss": 1.2536,
+      "step": 131
+    },
+    {
+      "epoch": 2.6530612244897958,
+      "grad_norm": 0.1734190732240677,
+      "learning_rate": 5.2953451939724454e-05,
+      "loss": 1.2628,
+      "step": 132
+    },
+    {
+      "epoch": 2.673469387755102,
+      "grad_norm": 0.214066281914711,
+      "learning_rate": 5.146980374689192e-05,
+      "loss": 1.2543,
+      "step": 133
+    },
+    {
+      "epoch": 2.693877551020408,
+      "grad_norm": 0.17507924139499664,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 1.2665,
+      "step": 134
+    },
+    {
+      "epoch": 2.7142857142857144,
+      "grad_norm": 0.1778109222650528,
+      "learning_rate": 4.854445999713715e-05,
+      "loss": 1.2789,
+      "step": 135
+    },
+    {
+      "epoch": 2.7346938775510203,
+      "grad_norm": 0.1856827288866043,
+      "learning_rate": 4.710359896730379e-05,
+      "loss": 1.2481,
+      "step": 136
+    },
+    {
+      "epoch": 2.7551020408163263,
+      "grad_norm": 0.17856694757938385,
+      "learning_rate": 4.567782795195816e-05,
+      "loss": 1.2732,
+      "step": 137
+    },
+    {
+      "epoch": 2.7755102040816326,
+      "grad_norm": 0.21598489582538605,
+      "learning_rate": 4.426755368775637e-05,
+      "loss": 1.2525,
+      "step": 138
+    },
+    {
+      "epoch": 2.795918367346939,
+      "grad_norm": 0.17308436334133148,
+      "learning_rate": 4.287317849052075e-05,
+      "loss": 1.2665,
+      "step": 139
+    },
+    {
+      "epoch": 2.816326530612245,
+      "grad_norm": 0.19207212328910828,
+      "learning_rate": 4.149510014046922e-05,
+      "loss": 1.2681,
+      "step": 140
+    },
+    {
+      "epoch": 2.836734693877551,
+      "grad_norm": 0.19626958668231964,
+      "learning_rate": 4.013371176873849e-05,
+      "loss": 1.2727,
+      "step": 141
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.1986483484506607,
+      "learning_rate": 3.878940174523371e-05,
+      "loss": 1.2414,
+      "step": 142
+    },
+    {
+      "epoch": 2.877551020408163,
+      "grad_norm": 0.19369089603424072,
+      "learning_rate": 3.746255356783632e-05,
+      "loss": 1.254,
+      "step": 143
+    },
+    {
+      "epoch": 2.877551020408163,
+      "eval_loss": 1.2410293817520142,
+      "eval_runtime": 270.6762,
+      "eval_samples_per_second": 6.262,
+      "eval_steps_per_second": 3.133,
+      "step": 143
+    },
+    {
+      "epoch": 2.8979591836734695,
+      "grad_norm": 0.20910531282424927,
+      "learning_rate": 3.615354575300166e-05,
+      "loss": 1.2541,
+      "step": 144
+    },
+    {
+      "epoch": 2.9183673469387754,
+      "grad_norm": 0.19536806643009186,
+      "learning_rate": 3.4862751727777797e-05,
+      "loss": 1.2517,
+      "step": 145
+    },
+    {
+      "epoch": 2.938775510204082,
+      "grad_norm": 0.18630966544151306,
+      "learning_rate": 3.3590539723276083e-05,
+      "loss": 1.2473,
+      "step": 146
+    },
+    {
+      "epoch": 2.9591836734693877,
+      "grad_norm": 0.1874723732471466,
+      "learning_rate": 3.233727266962425e-05,
+      "loss": 1.244,
+      "step": 147
+    },
+    {
+      "epoch": 2.979591836734694,
+      "grad_norm": 0.1764463186264038,
+      "learning_rate": 3.110330809243134e-05,
+      "loss": 1.2465,
+      "step": 148
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.16570010781288147,
+      "learning_rate": 2.9888998010794743e-05,
+      "loss": 1.2443,
+      "step": 149
+    },
+    {
+      "epoch": 3.020408163265306,
+      "grad_norm": 0.18820856511592865,
+      "learning_rate": 2.869468883687798e-05,
+      "loss": 1.2694,
+      "step": 150
+    },
+    {
+      "epoch": 3.020408163265306,
+      "grad_norm": 0.2009415626525879,
+      "learning_rate": 2.7520721277088024e-05,
+      "loss": 1.2185,
+      "step": 151
+    },
+    {
+      "epoch": 3.0408163265306123,
+      "grad_norm": 0.1824546605348587,
+      "learning_rate": 2.6367430234880284e-05,
+      "loss": 1.2222,
+      "step": 152
+    },
+    {
+      "epoch": 3.061224489795918,
+      "grad_norm": 0.180531844496727,
+      "learning_rate": 2.523514471521913e-05,
+      "loss": 1.2592,
+      "step": 153
+    },
+    {
+      "epoch": 3.0816326530612246,
+      "grad_norm": 0.17422904074192047,
+      "learning_rate": 2.4124187730720917e-05,
+      "loss": 1.2429,
+      "step": 154
+    },
+    {
+      "epoch": 3.1020408163265305,
+      "grad_norm": 0.17531636357307434,
+      "learning_rate": 2.3034876209506772e-05,
+      "loss": 1.2459,
+      "step": 155
+    },
+    {
+      "epoch": 3.122448979591837,
+      "grad_norm": 0.17256909608840942,
+      "learning_rate": 2.1967520904790827e-05,
+      "loss": 1.2523,
+      "step": 156
+    },
+    {
+      "epoch": 3.122448979591837,
+      "eval_loss": 1.240277886390686,
+      "eval_runtime": 270.7279,
+      "eval_samples_per_second": 6.261,
+      "eval_steps_per_second": 3.132,
+      "step": 156
+    },
+    {
+      "epoch": 3.142857142857143,
+      "grad_norm": 0.17711801826953888,
+      "learning_rate": 2.092242630623016e-05,
+      "loss": 1.2416,
+      "step": 157
+    },
+    {
+      "epoch": 3.163265306122449,
+      "grad_norm": 0.1642543524503708,
+      "learning_rate": 1.9899890553061562e-05,
+      "loss": 1.2563,
+      "step": 158
+    },
+    {
+      "epoch": 3.183673469387755,
+      "grad_norm": 0.17609795928001404,
+      "learning_rate": 1.8900205349049904e-05,
+      "loss": 1.2406,
+      "step": 159
+    },
+    {
+      "epoch": 3.204081632653061,
+      "grad_norm": 0.18534283339977264,
+      "learning_rate": 1.7923655879272393e-05,
+      "loss": 1.2522,
+      "step": 160
+    },
+    {
+      "epoch": 3.2244897959183674,
+      "grad_norm": 0.17926208674907684,
+      "learning_rate": 1.6970520728762375e-05,
+      "loss": 1.2315,
+      "step": 161
+    },
+    {
+      "epoch": 3.2448979591836733,
+      "grad_norm": 0.18245543539524078,
+      "learning_rate": 1.60410718030361e-05,
+      "loss": 1.2493,
+      "step": 162
+    },
+    {
+      "epoch": 3.2653061224489797,
+      "grad_norm": 0.16576482355594635,
+      "learning_rate": 1.5135574250524897e-05,
+      "loss": 1.2633,
+      "step": 163
+    },
+    {
+      "epoch": 3.2857142857142856,
+      "grad_norm": 0.1768399477005005,
+      "learning_rate": 1.425428638693489e-05,
+      "loss": 1.2399,
+      "step": 164
+    },
+    {
+      "epoch": 3.306122448979592,
+      "grad_norm": 0.17402540147304535,
+      "learning_rate": 1.339745962155613e-05,
+      "loss": 1.2574,
+      "step": 165
+    },
+    {
+      "epoch": 3.326530612244898,
+      "grad_norm": 0.17550399899482727,
+      "learning_rate": 1.2565338385541792e-05,
+      "loss": 1.2429,
+      "step": 166
+    },
+    {
+      "epoch": 3.3469387755102042,
+      "grad_norm": 0.18776686489582062,
+      "learning_rate": 1.1758160062178093e-05,
+      "loss": 1.2378,
+      "step": 167
+    },
+    {
+      "epoch": 3.36734693877551,
+      "grad_norm": 0.1816324144601822,
+      "learning_rate": 1.097615491916485e-05,
+      "loss": 1.2503,
+      "step": 168
+    },
+    {
+      "epoch": 3.387755102040816,
+      "grad_norm": 0.17802877724170685,
+      "learning_rate": 1.0219546042925843e-05,
+      "loss": 1.2468,
+      "step": 169
+    },
+    {
+      "epoch": 3.387755102040816,
+      "eval_loss": 1.2385426759719849,
+      "eval_runtime": 270.6389,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 169
+    },
+    {
+      "epoch": 3.4081632653061225,
+      "grad_norm": 0.1731177568435669,
+      "learning_rate": 9.488549274967872e-06,
+      "loss": 1.2431,
+      "step": 170
+    },
+    {
+      "epoch": 3.4285714285714284,
+      "grad_norm": 0.16203820705413818,
+      "learning_rate": 8.783373150306661e-06,
+      "loss": 1.2394,
+      "step": 171
+    },
+    {
+      "epoch": 3.4489795918367347,
+      "grad_norm": 0.1603914201259613,
+      "learning_rate": 8.10421883797694e-06,
+      "loss": 1.2317,
+      "step": 172
+    },
+    {
+      "epoch": 3.4693877551020407,
+      "grad_norm": 0.16672447323799133,
+      "learning_rate": 7.4512800836440525e-06,
+      "loss": 1.2382,
+      "step": 173
+    },
+    {
+      "epoch": 3.489795918367347,
+      "grad_norm": 0.16903318464756012,
+      "learning_rate": 6.824743154333157e-06,
+      "loss": 1.2406,
+      "step": 174
+    },
+    {
+      "epoch": 3.510204081632653,
+      "grad_norm": 0.16718582808971405,
+      "learning_rate": 6.22478678529197e-06,
+      "loss": 1.2253,
+      "step": 175
+    },
+    {
+      "epoch": 3.5306122448979593,
+      "grad_norm": 0.16773243248462677,
+      "learning_rate": 5.651582129001986e-06,
+      "loss": 1.2545,
+      "step": 176
+    },
+    {
+      "epoch": 3.5510204081632653,
+      "grad_norm": 0.16658060252666473,
+      "learning_rate": 5.105292706353093e-06,
+      "loss": 1.2329,
+      "step": 177
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 0.16760899126529694,
+      "learning_rate": 4.586074359995119e-06,
+      "loss": 1.2218,
+      "step": 178
+    },
+    {
+      "epoch": 3.5918367346938775,
+      "grad_norm": 0.17462213337421417,
+      "learning_rate": 4.094075209879788e-06,
+      "loss": 1.236,
+      "step": 179
+    },
+    {
+      "epoch": 3.612244897959184,
+      "grad_norm": 0.16253593564033508,
+      "learning_rate": 3.6294356110059157e-06,
+      "loss": 1.2518,
+      "step": 180
+    },
+    {
+      "epoch": 3.63265306122449,
+      "grad_norm": 0.16653120517730713,
+      "learning_rate": 3.1922881133795825e-06,
+      "loss": 1.2171,
+      "step": 181
+    },
+    {
+      "epoch": 3.6530612244897958,
+      "grad_norm": 0.1757594645023346,
+      "learning_rate": 2.7827574242009437e-06,
+      "loss": 1.2476,
+      "step": 182
+    },
+    {
+      "epoch": 3.6530612244897958,
+      "eval_loss": 1.237037181854248,
+      "eval_runtime": 270.3815,
+      "eval_samples_per_second": 6.269,
+      "eval_steps_per_second": 3.136,
+      "step": 182
+    },
+    {
+      "epoch": 3.673469387755102,
+      "grad_norm": 0.1665186882019043,
+      "learning_rate": 2.4009603722884742e-06,
+      "loss": 1.2497,
+      "step": 183
+    },
+    {
+      "epoch": 3.693877551020408,
+      "grad_norm": 0.17469817399978638,
+      "learning_rate": 2.0470058747505516e-06,
+      "loss": 1.2426,
+      "step": 184
+    },
+    {
+      "epoch": 3.7142857142857144,
+      "grad_norm": 0.17130160331726074,
+      "learning_rate": 1.7209949059142083e-06,
+      "loss": 1.2255,
+      "step": 185
+    },
+    {
+      "epoch": 3.7346938775510203,
+      "grad_norm": 0.1677573323249817,
+      "learning_rate": 1.4230204685196203e-06,
+      "loss": 1.2643,
+      "step": 186
+    },
+    {
+      "epoch": 3.7551020408163263,
+      "grad_norm": 0.16778886318206787,
+      "learning_rate": 1.1531675671888619e-06,
+      "loss": 1.234,
+      "step": 187
+    },
+    {
+      "epoch": 3.7755102040816326,
+      "grad_norm": 0.16397559642791748,
+      "learning_rate": 9.11513184176116e-07,
+      "loss": 1.2509,
+      "step": 188
+    },
+    {
+      "epoch": 3.795918367346939,
+      "grad_norm": 0.16539420187473297,
+      "learning_rate": 6.981262574066394e-07,
+      "loss": 1.2425,
+      "step": 189
+    },
+    {
+      "epoch": 3.816326530612245,
+      "grad_norm": 0.18255014717578888,
+      "learning_rate": 5.130676608104845e-07,
+      "loss": 1.2628,
+      "step": 190
+    },
+    {
+      "epoch": 3.836734693877551,
+      "grad_norm": 0.16024163365364075,
+      "learning_rate": 3.56390186956701e-07,
+      "loss": 1.2331,
+      "step": 191
+    },
+    {
+      "epoch": 3.857142857142857,
+      "grad_norm": 0.17575234174728394,
+      "learning_rate": 2.2813853199292746e-07,
+      "loss": 1.2497,
+      "step": 192
+    },
+    {
+      "epoch": 3.877551020408163,
+      "grad_norm": 0.1590609848499298,
+      "learning_rate": 1.2834928289472416e-07,
+      "loss": 1.2436,
+      "step": 193
+    },
+    {
+      "epoch": 3.8979591836734695,
+      "grad_norm": 0.17772971093654633,
+      "learning_rate": 5.705090702819993e-08,
+      "loss": 1.2361,
+      "step": 194
+    },
+    {
+      "epoch": 3.9183673469387754,
+      "grad_norm": 0.15970654785633087,
+      "learning_rate": 1.426374402901942e-08,
+      "loss": 1.2366,
+      "step": 195
+    },
+    {
+      "epoch": 3.9183673469387754,
+      "eval_loss": 1.2375136613845825,
+      "eval_runtime": 270.7418,
+      "eval_samples_per_second": 6.261,
+      "eval_steps_per_second": 3.132,
+      "step": 195
+    },
+    {
+      "epoch": 3.938775510204082,
+      "grad_norm": 0.15187527239322662,
+      "learning_rate": 0.0,
+      "loss": 1.2409,
+      "step": 196
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 196,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 49,
+  "total_flos": 4.083740321198899e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-196/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816

checkpoint-49/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-49/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-49/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15a7afbbb6db02fdac7ffe868d42729e1c9515f835763d3b9551db4ae31e3529
+size 100966336

checkpoint-49/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9db4145fa287fcc2dc98bac341ab537efce6a4407796361cd24ac6b2176f6a70
+size 50916644

checkpoint-49/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41e595b32f221472ac195c50986dfcd13bac01a4909d487f497aaa38e078d0c2
+size 14244

checkpoint-49/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e5af14094f757ccb041613325b6c93fe808050ec47f3a4ec285ab4a0e229950
+size 1064

checkpoint-49/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-49/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-49/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-49/trainer_state.json ADDED Viewed

	@@ -0,0 +1,396 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 13,
+  "global_step": 49,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02040816326530612,
+      "grad_norm": 0.7881951332092285,
+      "learning_rate": 2e-05,
+      "loss": 2.7509,
+      "step": 1
+    },
+    {
+      "epoch": 0.02040816326530612,
+      "eval_loss": 2.6902382373809814,
+      "eval_runtime": 269.5606,
+      "eval_samples_per_second": 6.288,
+      "eval_steps_per_second": 3.146,
+      "step": 1
+    },
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 0.789082407951355,
+      "learning_rate": 4e-05,
+      "loss": 2.7449,
+      "step": 2
+    },
+    {
+      "epoch": 0.061224489795918366,
+      "grad_norm": 0.7354114055633545,
+      "learning_rate": 6e-05,
+      "loss": 2.7164,
+      "step": 3
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 0.7292255759239197,
+      "learning_rate": 8e-05,
+      "loss": 2.7174,
+      "step": 4
+    },
+    {
+      "epoch": 0.10204081632653061,
+      "grad_norm": 0.6898028254508972,
+      "learning_rate": 0.0001,
+      "loss": 2.6891,
+      "step": 5
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 0.6861400604248047,
+      "learning_rate": 0.00012,
+      "loss": 2.6545,
+      "step": 6
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.7510350346565247,
+      "learning_rate": 0.00014,
+      "loss": 2.5656,
+      "step": 7
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.8011165261268616,
+      "learning_rate": 0.00016,
+      "loss": 2.4519,
+      "step": 8
+    },
+    {
+      "epoch": 0.1836734693877551,
+      "grad_norm": 0.8624005317687988,
+      "learning_rate": 0.00018,
+      "loss": 2.3178,
+      "step": 9
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 0.8004987835884094,
+      "learning_rate": 0.0002,
+      "loss": 2.1783,
+      "step": 10
+    },
+    {
+      "epoch": 0.22448979591836735,
+      "grad_norm": 0.6362400054931641,
+      "learning_rate": 0.000199985736255971,
+      "loss": 2.0252,
+      "step": 11
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 0.7930936217308044,
+      "learning_rate": 0.0001999429490929718,
+      "loss": 1.8839,
+      "step": 12
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "grad_norm": 0.5149843096733093,
+      "learning_rate": 0.00019987165071710527,
+      "loss": 1.8064,
+      "step": 13
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "eval_loss": 1.6734941005706787,
+      "eval_runtime": 271.2615,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 13
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.42121434211730957,
+      "learning_rate": 0.00019977186146800707,
+      "loss": 1.7922,
+      "step": 14
+    },
+    {
+      "epoch": 0.30612244897959184,
+      "grad_norm": 0.3523242771625519,
+      "learning_rate": 0.0001996436098130433,
+      "loss": 1.7711,
+      "step": 15
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.3384595215320587,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 1.7152,
+      "step": 16
+    },
+    {
+      "epoch": 0.3469387755102041,
+      "grad_norm": 0.34942421317100525,
+      "learning_rate": 0.00019930187374259337,
+      "loss": 1.7112,
+      "step": 17
+    },
+    {
+      "epoch": 0.3673469387755102,
+      "grad_norm": 0.31712639331817627,
+      "learning_rate": 0.00019908848681582391,
+      "loss": 1.7059,
+      "step": 18
+    },
+    {
+      "epoch": 0.3877551020408163,
+      "grad_norm": 0.2875436842441559,
+      "learning_rate": 0.00019884683243281116,
+      "loss": 1.6468,
+      "step": 19
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.24433130025863647,
+      "learning_rate": 0.00019857697953148037,
+      "loss": 1.6408,
+      "step": 20
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.21414674818515778,
+      "learning_rate": 0.00019827900509408581,
+      "loss": 1.616,
+      "step": 21
+    },
+    {
+      "epoch": 0.4489795918367347,
+      "grad_norm": 0.21537622809410095,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 1.609,
+      "step": 22
+    },
+    {
+      "epoch": 0.46938775510204084,
+      "grad_norm": 0.2432074397802353,
+      "learning_rate": 0.00019759903962771156,
+      "loss": 1.6066,
+      "step": 23
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 0.2359839379787445,
+      "learning_rate": 0.00019721724257579907,
+      "loss": 1.5851,
+      "step": 24
+    },
+    {
+      "epoch": 0.5102040816326531,
+      "grad_norm": 0.22065888345241547,
+      "learning_rate": 0.00019680771188662044,
+      "loss": 1.5739,
+      "step": 25
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "grad_norm": 0.20339132845401764,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 1.5513,
+      "step": 26
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "eval_loss": 1.4832030534744263,
+      "eval_runtime": 271.2449,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 26
+    },
+    {
+      "epoch": 0.5510204081632653,
+      "grad_norm": 0.18875224888324738,
+      "learning_rate": 0.00019590592479012023,
+      "loss": 1.5378,
+      "step": 27
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.18564417958259583,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 1.5212,
+      "step": 28
+    },
+    {
+      "epoch": 0.5918367346938775,
+      "grad_norm": 0.16226942837238312,
+      "learning_rate": 0.00019489470729364692,
+      "loss": 1.5391,
+      "step": 29
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 0.15650039911270142,
+      "learning_rate": 0.00019434841787099803,
+      "loss": 1.511,
+      "step": 30
+    },
+    {
+      "epoch": 0.6326530612244898,
+      "grad_norm": 0.15976540744304657,
+      "learning_rate": 0.00019377521321470805,
+      "loss": 1.5119,
+      "step": 31
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.16409288346767426,
+      "learning_rate": 0.00019317525684566685,
+      "loss": 1.4909,
+      "step": 32
+    },
+    {
+      "epoch": 0.673469387755102,
+      "grad_norm": 0.15468019247055054,
+      "learning_rate": 0.00019254871991635598,
+      "loss": 1.4951,
+      "step": 33
+    },
+    {
+      "epoch": 0.6938775510204082,
+      "grad_norm": 0.1462036371231079,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 1.4643,
+      "step": 34
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.1541963368654251,
+      "learning_rate": 0.00019121662684969335,
+      "loss": 1.5159,
+      "step": 35
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 0.14798064529895782,
+      "learning_rate": 0.00019051145072503215,
+      "loss": 1.4741,
+      "step": 36
+    },
+    {
+      "epoch": 0.7551020408163265,
+      "grad_norm": 0.13914817571640015,
+      "learning_rate": 0.00018978045395707418,
+      "loss": 1.4788,
+      "step": 37
+    },
+    {
+      "epoch": 0.7755102040816326,
+      "grad_norm": 0.15608824789524078,
+      "learning_rate": 0.00018902384508083517,
+      "loss": 1.4687,
+      "step": 38
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "grad_norm": 0.14460116624832153,
+      "learning_rate": 0.00018824183993782192,
+      "loss": 1.482,
+      "step": 39
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "eval_loss": 1.411073088645935,
+      "eval_runtime": 271.292,
+      "eval_samples_per_second": 6.248,
+      "eval_steps_per_second": 3.126,
+      "step": 39
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.15740551054477692,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 1.4486,
+      "step": 40
+    },
+    {
+      "epoch": 0.8367346938775511,
+      "grad_norm": 0.14149661362171173,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.4353,
+      "step": 41
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.14034292101860046,
+      "learning_rate": 0.0001857457136130651,
+      "loss": 1.4523,
+      "step": 42
+    },
+    {
+      "epoch": 0.8775510204081632,
+      "grad_norm": 0.1487722396850586,
+      "learning_rate": 0.00018486442574947511,
+      "loss": 1.4095,
+      "step": 43
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 0.17400234937667847,
+      "learning_rate": 0.00018395892819696389,
+      "loss": 1.4414,
+      "step": 44
+    },
+    {
+      "epoch": 0.9183673469387755,
+      "grad_norm": 0.1741325408220291,
+      "learning_rate": 0.00018302947927123766,
+      "loss": 1.4379,
+      "step": 45
+    },
+    {
+      "epoch": 0.9387755102040817,
+      "grad_norm": 0.15319454669952393,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 1.405,
+      "step": 46
+    },
+    {
+      "epoch": 0.9591836734693877,
+      "grad_norm": 0.15876264870166779,
+      "learning_rate": 0.00018109979465095013,
+      "loss": 1.4122,
+      "step": 47
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.17120805382728577,
+      "learning_rate": 0.00018010010944693848,
+      "loss": 1.4132,
+      "step": 48
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1436116099357605,
+      "learning_rate": 0.00017907757369376985,
+      "loss": 1.416,
+      "step": 49
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 196,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 49,
+  "total_flos": 1.0209350802997248e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-49/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816

checkpoint-98/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

checkpoint-98/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "o_proj",
+    "down_proj",
+    "k_proj",
+    "q_proj",
+    "gate_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-98/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88f74a76e06a6e5698ca16a682f4fa5d7e5c10182d165fe6c9327116444b10d0
+size 100966336

checkpoint-98/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c01f653a0ce9ea304a86d075b21cd51ea729659b91629c555eec65181dd1818
+size 50916644

checkpoint-98/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff339d3bf5bb702320fd9a759e0988b159a701364f186575c95d51b72519d7a1
+size 14244

checkpoint-98/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e31465eabc96d2c0b0dc68386782c8ea3a5771edcba13d0d620c4297cd31957
+size 1064

checkpoint-98/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-98/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

checkpoint-98/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

checkpoint-98/trainer_state.json ADDED Viewed

	@@ -0,0 +1,771 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.9795918367346939,
+  "eval_steps": 13,
+  "global_step": 98,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02040816326530612,
+      "grad_norm": 0.7881951332092285,
+      "learning_rate": 2e-05,
+      "loss": 2.7509,
+      "step": 1
+    },
+    {
+      "epoch": 0.02040816326530612,
+      "eval_loss": 2.6902382373809814,
+      "eval_runtime": 269.5606,
+      "eval_samples_per_second": 6.288,
+      "eval_steps_per_second": 3.146,
+      "step": 1
+    },
+    {
+      "epoch": 0.04081632653061224,
+      "grad_norm": 0.789082407951355,
+      "learning_rate": 4e-05,
+      "loss": 2.7449,
+      "step": 2
+    },
+    {
+      "epoch": 0.061224489795918366,
+      "grad_norm": 0.7354114055633545,
+      "learning_rate": 6e-05,
+      "loss": 2.7164,
+      "step": 3
+    },
+    {
+      "epoch": 0.08163265306122448,
+      "grad_norm": 0.7292255759239197,
+      "learning_rate": 8e-05,
+      "loss": 2.7174,
+      "step": 4
+    },
+    {
+      "epoch": 0.10204081632653061,
+      "grad_norm": 0.6898028254508972,
+      "learning_rate": 0.0001,
+      "loss": 2.6891,
+      "step": 5
+    },
+    {
+      "epoch": 0.12244897959183673,
+      "grad_norm": 0.6861400604248047,
+      "learning_rate": 0.00012,
+      "loss": 2.6545,
+      "step": 6
+    },
+    {
+      "epoch": 0.14285714285714285,
+      "grad_norm": 0.7510350346565247,
+      "learning_rate": 0.00014,
+      "loss": 2.5656,
+      "step": 7
+    },
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.8011165261268616,
+      "learning_rate": 0.00016,
+      "loss": 2.4519,
+      "step": 8
+    },
+    {
+      "epoch": 0.1836734693877551,
+      "grad_norm": 0.8624005317687988,
+      "learning_rate": 0.00018,
+      "loss": 2.3178,
+      "step": 9
+    },
+    {
+      "epoch": 0.20408163265306123,
+      "grad_norm": 0.8004987835884094,
+      "learning_rate": 0.0002,
+      "loss": 2.1783,
+      "step": 10
+    },
+    {
+      "epoch": 0.22448979591836735,
+      "grad_norm": 0.6362400054931641,
+      "learning_rate": 0.000199985736255971,
+      "loss": 2.0252,
+      "step": 11
+    },
+    {
+      "epoch": 0.24489795918367346,
+      "grad_norm": 0.7930936217308044,
+      "learning_rate": 0.0001999429490929718,
+      "loss": 1.8839,
+      "step": 12
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "grad_norm": 0.5149843096733093,
+      "learning_rate": 0.00019987165071710527,
+      "loss": 1.8064,
+      "step": 13
+    },
+    {
+      "epoch": 0.2653061224489796,
+      "eval_loss": 1.6734941005706787,
+      "eval_runtime": 271.2615,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 13
+    },
+    {
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.42121434211730957,
+      "learning_rate": 0.00019977186146800707,
+      "loss": 1.7922,
+      "step": 14
+    },
+    {
+      "epoch": 0.30612244897959184,
+      "grad_norm": 0.3523242771625519,
+      "learning_rate": 0.0001996436098130433,
+      "loss": 1.7711,
+      "step": 15
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.3384595215320587,
+      "learning_rate": 0.00019948693233918952,
+      "loss": 1.7152,
+      "step": 16
+    },
+    {
+      "epoch": 0.3469387755102041,
+      "grad_norm": 0.34942421317100525,
+      "learning_rate": 0.00019930187374259337,
+      "loss": 1.7112,
+      "step": 17
+    },
+    {
+      "epoch": 0.3673469387755102,
+      "grad_norm": 0.31712639331817627,
+      "learning_rate": 0.00019908848681582391,
+      "loss": 1.7059,
+      "step": 18
+    },
+    {
+      "epoch": 0.3877551020408163,
+      "grad_norm": 0.2875436842441559,
+      "learning_rate": 0.00019884683243281116,
+      "loss": 1.6468,
+      "step": 19
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.24433130025863647,
+      "learning_rate": 0.00019857697953148037,
+      "loss": 1.6408,
+      "step": 20
+    },
+    {
+      "epoch": 0.42857142857142855,
+      "grad_norm": 0.21414674818515778,
+      "learning_rate": 0.00019827900509408581,
+      "loss": 1.616,
+      "step": 21
+    },
+    {
+      "epoch": 0.4489795918367347,
+      "grad_norm": 0.21537622809410095,
+      "learning_rate": 0.00019795299412524945,
+      "loss": 1.609,
+      "step": 22
+    },
+    {
+      "epoch": 0.46938775510204084,
+      "grad_norm": 0.2432074397802353,
+      "learning_rate": 0.00019759903962771156,
+      "loss": 1.6066,
+      "step": 23
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 0.2359839379787445,
+      "learning_rate": 0.00019721724257579907,
+      "loss": 1.5851,
+      "step": 24
+    },
+    {
+      "epoch": 0.5102040816326531,
+      "grad_norm": 0.22065888345241547,
+      "learning_rate": 0.00019680771188662044,
+      "loss": 1.5739,
+      "step": 25
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "grad_norm": 0.20339132845401764,
+      "learning_rate": 0.0001963705643889941,
+      "loss": 1.5513,
+      "step": 26
+    },
+    {
+      "epoch": 0.5306122448979592,
+      "eval_loss": 1.4832030534744263,
+      "eval_runtime": 271.2449,
+      "eval_samples_per_second": 6.249,
+      "eval_steps_per_second": 3.126,
+      "step": 26
+    },
+    {
+      "epoch": 0.5510204081632653,
+      "grad_norm": 0.18875224888324738,
+      "learning_rate": 0.00019590592479012023,
+      "loss": 1.5378,
+      "step": 27
+    },
+    {
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.18564417958259583,
+      "learning_rate": 0.00019541392564000488,
+      "loss": 1.5212,
+      "step": 28
+    },
+    {
+      "epoch": 0.5918367346938775,
+      "grad_norm": 0.16226942837238312,
+      "learning_rate": 0.00019489470729364692,
+      "loss": 1.5391,
+      "step": 29
+    },
+    {
+      "epoch": 0.6122448979591837,
+      "grad_norm": 0.15650039911270142,
+      "learning_rate": 0.00019434841787099803,
+      "loss": 1.511,
+      "step": 30
+    },
+    {
+      "epoch": 0.6326530612244898,
+      "grad_norm": 0.15976540744304657,
+      "learning_rate": 0.00019377521321470805,
+      "loss": 1.5119,
+      "step": 31
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.16409288346767426,
+      "learning_rate": 0.00019317525684566685,
+      "loss": 1.4909,
+      "step": 32
+    },
+    {
+      "epoch": 0.673469387755102,
+      "grad_norm": 0.15468019247055054,
+      "learning_rate": 0.00019254871991635598,
+      "loss": 1.4951,
+      "step": 33
+    },
+    {
+      "epoch": 0.6938775510204082,
+      "grad_norm": 0.1462036371231079,
+      "learning_rate": 0.00019189578116202307,
+      "loss": 1.4643,
+      "step": 34
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.1541963368654251,
+      "learning_rate": 0.00019121662684969335,
+      "loss": 1.5159,
+      "step": 35
+    },
+    {
+      "epoch": 0.7346938775510204,
+      "grad_norm": 0.14798064529895782,
+      "learning_rate": 0.00019051145072503215,
+      "loss": 1.4741,
+      "step": 36
+    },
+    {
+      "epoch": 0.7551020408163265,
+      "grad_norm": 0.13914817571640015,
+      "learning_rate": 0.00018978045395707418,
+      "loss": 1.4788,
+      "step": 37
+    },
+    {
+      "epoch": 0.7755102040816326,
+      "grad_norm": 0.15608824789524078,
+      "learning_rate": 0.00018902384508083517,
+      "loss": 1.4687,
+      "step": 38
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "grad_norm": 0.14460116624832153,
+      "learning_rate": 0.00018824183993782192,
+      "loss": 1.482,
+      "step": 39
+    },
+    {
+      "epoch": 0.7959183673469388,
+      "eval_loss": 1.411073088645935,
+      "eval_runtime": 271.292,
+      "eval_samples_per_second": 6.248,
+      "eval_steps_per_second": 3.126,
+      "step": 39
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.15740551054477692,
+      "learning_rate": 0.00018743466161445823,
+      "loss": 1.4486,
+      "step": 40
+    },
+    {
+      "epoch": 0.8367346938775511,
+      "grad_norm": 0.14149661362171173,
+      "learning_rate": 0.00018660254037844388,
+      "loss": 1.4353,
+      "step": 41
+    },
+    {
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.14034292101860046,
+      "learning_rate": 0.0001857457136130651,
+      "loss": 1.4523,
+      "step": 42
+    },
+    {
+      "epoch": 0.8775510204081632,
+      "grad_norm": 0.1487722396850586,
+      "learning_rate": 0.00018486442574947511,
+      "loss": 1.4095,
+      "step": 43
+    },
+    {
+      "epoch": 0.8979591836734694,
+      "grad_norm": 0.17400234937667847,
+      "learning_rate": 0.00018395892819696389,
+      "loss": 1.4414,
+      "step": 44
+    },
+    {
+      "epoch": 0.9183673469387755,
+      "grad_norm": 0.1741325408220291,
+      "learning_rate": 0.00018302947927123766,
+      "loss": 1.4379,
+      "step": 45
+    },
+    {
+      "epoch": 0.9387755102040817,
+      "grad_norm": 0.15319454669952393,
+      "learning_rate": 0.00018207634412072764,
+      "loss": 1.405,
+      "step": 46
+    },
+    {
+      "epoch": 0.9591836734693877,
+      "grad_norm": 0.15876264870166779,
+      "learning_rate": 0.00018109979465095013,
+      "loss": 1.4122,
+      "step": 47
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.17120805382728577,
+      "learning_rate": 0.00018010010944693848,
+      "loss": 1.4132,
+      "step": 48
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1436116099357605,
+      "learning_rate": 0.00017907757369376985,
+      "loss": 1.416,
+      "step": 49
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.1707429438829422,
+      "learning_rate": 0.0001780324790952092,
+      "loss": 1.3913,
+      "step": 50
+    },
+    {
+      "epoch": 1.0204081632653061,
+      "grad_norm": 0.17117524147033691,
+      "learning_rate": 0.00017696512379049325,
+      "loss": 1.3963,
+      "step": 51
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "grad_norm": 0.13410089910030365,
+      "learning_rate": 0.0001758758122692791,
+      "loss": 1.392,
+      "step": 52
+    },
+    {
+      "epoch": 1.0408163265306123,
+      "eval_loss": 1.3676769733428955,
+      "eval_runtime": 270.8566,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 52
+    },
+    {
+      "epoch": 1.0612244897959184,
+      "grad_norm": 0.18877607583999634,
+      "learning_rate": 0.00017476485528478093,
+      "loss": 1.3854,
+      "step": 53
+    },
+    {
+      "epoch": 1.0816326530612246,
+      "grad_norm": 0.1752927452325821,
+      "learning_rate": 0.00017363256976511972,
+      "loss": 1.3759,
+      "step": 54
+    },
+    {
+      "epoch": 1.1020408163265305,
+      "grad_norm": 0.17180170118808746,
+      "learning_rate": 0.000172479278722912,
+      "loss": 1.3614,
+      "step": 55
+    },
+    {
+      "epoch": 1.1224489795918366,
+      "grad_norm": 0.1640290915966034,
+      "learning_rate": 0.00017130531116312203,
+      "loss": 1.3853,
+      "step": 56
+    },
+    {
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.2047068476676941,
+      "learning_rate": 0.0001701110019892053,
+      "loss": 1.3699,
+      "step": 57
+    },
+    {
+      "epoch": 1.163265306122449,
+      "grad_norm": 0.1835869997739792,
+      "learning_rate": 0.00016889669190756868,
+      "loss": 1.3403,
+      "step": 58
+    },
+    {
+      "epoch": 1.183673469387755,
+      "grad_norm": 0.16733241081237793,
+      "learning_rate": 0.00016766272733037576,
+      "loss": 1.3609,
+      "step": 59
+    },
+    {
+      "epoch": 1.2040816326530612,
+      "grad_norm": 0.178726926445961,
+      "learning_rate": 0.00016640946027672392,
+      "loss": 1.3651,
+      "step": 60
+    },
+    {
+      "epoch": 1.2244897959183674,
+      "grad_norm": 0.16719630360603333,
+      "learning_rate": 0.00016513724827222227,
+      "loss": 1.3676,
+      "step": 61
+    },
+    {
+      "epoch": 1.2448979591836735,
+      "grad_norm": 0.15999363362789154,
+      "learning_rate": 0.00016384645424699835,
+      "loss": 1.3651,
+      "step": 62
+    },
+    {
+      "epoch": 1.2653061224489797,
+      "grad_norm": 0.1705988198518753,
+      "learning_rate": 0.00016253744643216368,
+      "loss": 1.3757,
+      "step": 63
+    },
+    {
+      "epoch": 1.2857142857142856,
+      "grad_norm": 0.14996370673179626,
+      "learning_rate": 0.0001612105982547663,
+      "loss": 1.3474,
+      "step": 64
+    },
+    {
+      "epoch": 1.306122448979592,
+      "grad_norm": 0.19127260148525238,
+      "learning_rate": 0.0001598662882312615,
+      "loss": 1.3414,
+      "step": 65
+    },
+    {
+      "epoch": 1.306122448979592,
+      "eval_loss": 1.331880807876587,
+      "eval_runtime": 270.8424,
+      "eval_samples_per_second": 6.258,
+      "eval_steps_per_second": 3.131,
+      "step": 65
+    },
+    {
+      "epoch": 1.3265306122448979,
+      "grad_norm": 0.16125527024269104,
+      "learning_rate": 0.00015850489985953076,
+      "loss": 1.3509,
+      "step": 66
+    },
+    {
+      "epoch": 1.346938775510204,
+      "grad_norm": 0.1979473978281021,
+      "learning_rate": 0.00015712682150947923,
+      "loss": 1.3579,
+      "step": 67
+    },
+    {
+      "epoch": 1.3673469387755102,
+      "grad_norm": 0.18317992985248566,
+      "learning_rate": 0.00015573244631224365,
+      "loss": 1.3341,
+      "step": 68
+    },
+    {
+      "epoch": 1.3877551020408163,
+      "grad_norm": 0.1646898239850998,
+      "learning_rate": 0.0001543221720480419,
+      "loss": 1.3361,
+      "step": 69
+    },
+    {
+      "epoch": 1.4081632653061225,
+      "grad_norm": 0.1760271042585373,
+      "learning_rate": 0.00015289640103269625,
+      "loss": 1.358,
+      "step": 70
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.165283203125,
+      "learning_rate": 0.0001514555400028629,
+      "loss": 1.3072,
+      "step": 71
+    },
+    {
+      "epoch": 1.4489795918367347,
+      "grad_norm": 0.1507076472043991,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.3133,
+      "step": 72
+    },
+    {
+      "epoch": 1.469387755102041,
+      "grad_norm": 0.16913647949695587,
+      "learning_rate": 0.00014853019625310813,
+      "loss": 1.3232,
+      "step": 73
+    },
+    {
+      "epoch": 1.489795918367347,
+      "grad_norm": 0.18266479671001434,
+      "learning_rate": 0.0001470465480602756,
+      "loss": 1.3512,
+      "step": 74
+    },
+    {
+      "epoch": 1.510204081632653,
+      "grad_norm": 0.19301828742027283,
+      "learning_rate": 0.0001455494786690634,
+      "loss": 1.3241,
+      "step": 75
+    },
+    {
+      "epoch": 1.5306122448979593,
+      "grad_norm": 0.16109652817249298,
+      "learning_rate": 0.00014403941515576344,
+      "loss": 1.3256,
+      "step": 76
+    },
+    {
+      "epoch": 1.5510204081632653,
+      "grad_norm": 0.17053867876529694,
+      "learning_rate": 0.00014251678830356408,
+      "loss": 1.3162,
+      "step": 77
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "grad_norm": 0.17348544299602509,
+      "learning_rate": 0.00014098203247965875,
+      "loss": 1.3213,
+      "step": 78
+    },
+    {
+      "epoch": 1.5714285714285714,
+      "eval_loss": 1.3028697967529297,
+      "eval_runtime": 270.8095,
+      "eval_samples_per_second": 6.259,
+      "eval_steps_per_second": 3.131,
+      "step": 78
+    },
+    {
+      "epoch": 1.5918367346938775,
+      "grad_norm": 0.1703907549381256,
+      "learning_rate": 0.00013943558551133186,
+      "loss": 1.3073,
+      "step": 79
+    },
+    {
+      "epoch": 1.6122448979591837,
+      "grad_norm": 0.17313100397586823,
+      "learning_rate": 0.0001378778885610576,
+      "loss": 1.3232,
+      "step": 80
+    },
+    {
+      "epoch": 1.6326530612244898,
+      "grad_norm": 0.17237025499343872,
+      "learning_rate": 0.00013630938600064747,
+      "loss": 1.3406,
+      "step": 81
+    },
+    {
+      "epoch": 1.6530612244897958,
+      "grad_norm": 0.19658459722995758,
+      "learning_rate": 0.00013473052528448201,
+      "loss": 1.3114,
+      "step": 82
+    },
+    {
+      "epoch": 1.6734693877551021,
+      "grad_norm": 0.20599938929080963,
+      "learning_rate": 0.0001331417568218636,
+      "loss": 1.3288,
+      "step": 83
+    },
+    {
+      "epoch": 1.693877551020408,
+      "grad_norm": 0.17759399116039276,
+      "learning_rate": 0.00013154353384852558,
+      "loss": 1.2995,
+      "step": 84
+    },
+    {
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.18712250888347626,
+      "learning_rate": 0.00012993631229733582,
+      "loss": 1.2895,
+      "step": 85
+    },
+    {
+      "epoch": 1.7346938775510203,
+      "grad_norm": 0.1991330236196518,
+      "learning_rate": 0.00012832055066823038,
+      "loss": 1.2886,
+      "step": 86
+    },
+    {
+      "epoch": 1.7551020408163265,
+      "grad_norm": 0.22125203907489777,
+      "learning_rate": 0.00012669670989741517,
+      "loss": 1.3233,
+      "step": 87
+    },
+    {
+      "epoch": 1.7755102040816326,
+      "grad_norm": 0.2052813619375229,
+      "learning_rate": 0.00012506525322587207,
+      "loss": 1.3079,
+      "step": 88
+    },
+    {
+      "epoch": 1.7959183673469388,
+      "grad_norm": 0.19290736317634583,
+      "learning_rate": 0.00012342664606720822,
+      "loss": 1.3174,
+      "step": 89
+    },
+    {
+      "epoch": 1.816326530612245,
+      "grad_norm": 0.20912542939186096,
+      "learning_rate": 0.00012178135587488515,
+      "loss": 1.2915,
+      "step": 90
+    },
+    {
+      "epoch": 1.836734693877551,
+      "grad_norm": 0.20760588347911835,
+      "learning_rate": 0.00012012985200886602,
+      "loss": 1.3028,
+      "step": 91
+    },
+    {
+      "epoch": 1.836734693877551,
+      "eval_loss": 1.2795333862304688,
+      "eval_runtime": 270.6525,
+      "eval_samples_per_second": 6.263,
+      "eval_steps_per_second": 3.133,
+      "step": 91
+    },
+    {
+      "epoch": 1.8571428571428572,
+      "grad_norm": 0.1996900886297226,
+      "learning_rate": 0.00011847260560171896,
+      "loss": 1.3119,
+      "step": 92
+    },
+    {
+      "epoch": 1.8775510204081631,
+      "grad_norm": 0.23766876757144928,
+      "learning_rate": 0.00011681008942421483,
+      "loss": 1.2978,
+      "step": 93
+    },
+    {
+      "epoch": 1.8979591836734695,
+      "grad_norm": 0.19782397150993347,
+      "learning_rate": 0.00011514277775045768,
+      "loss": 1.2955,
+      "step": 94
+    },
+    {
+      "epoch": 1.9183673469387754,
+      "grad_norm": 0.22519494593143463,
+      "learning_rate": 0.00011347114622258612,
+      "loss": 1.2957,
+      "step": 95
+    },
+    {
+      "epoch": 1.9387755102040818,
+      "grad_norm": 0.2590245306491852,
+      "learning_rate": 0.00011179567171508463,
+      "loss": 1.2809,
+      "step": 96
+    },
+    {
+      "epoch": 1.9591836734693877,
+      "grad_norm": 0.2235420197248459,
+      "learning_rate": 0.00011011683219874323,
+      "loss": 1.2784,
+      "step": 97
+    },
+    {
+      "epoch": 1.9795918367346939,
+      "grad_norm": 0.285740464925766,
+      "learning_rate": 0.00010843510660430447,
+      "loss": 1.309,
+      "step": 98
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 196,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 49,
+  "total_flos": 2.0418701605994496e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-98/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9cc4ff61a4799ac22d5c627a0169a20cc75619ae1b5871f2d114f95284d87a6
+size 5816

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 22,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2",
+  "use_cache": false,
+  "vocab_size": 32000
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723