MilaWang commited on Mar 28

Commit

ae96c31

verified ·

1 Parent(s): 48c7155

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/trainer_state.json +838 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/trainer_state.json +958 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/trainer_state.json +146 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/trainer_state.json +259 -0

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b15ba179d52b965afc887dcefa50ef00275d51afaf4596b021f286670ab61f5b
+size 109069176

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1078/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f69f44a1869bae416f6bd192c9bb397ba330ba667dc8f658da1a5be58dd58594
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e034241ec0b0e1b6d9475bdc7e6dbd0e4f7d1591f952ce837b3e4c560824b69
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4a0af705e9764be37cb1c5818e4e3f6821f669fbc39af27434d248e31c735d08
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:614abd620db6f543e9653055aa3bfe54f6a360a875824c97c15665aaf537aa7c
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,838 @@

+{
+  "best_metric": 1.8057786226272583,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154",
+  "epoch": 7.0,
+  "eval_steps": 10,
+  "global_step": 1078,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9856782555580139,
+      "learning_rate": 0.0002,
+      "loss": 2.593,
+      "step": 10
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 1.0205717086791992,
+      "learning_rate": 0.0002,
+      "loss": 2.229,
+      "step": 20
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.7780327200889587,
+      "learning_rate": 0.0002,
+      "loss": 2.0632,
+      "step": 30
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.7994171977043152,
+      "learning_rate": 0.0002,
+      "loss": 2.0682,
+      "step": 40
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.7783251404762268,
+      "learning_rate": 0.0002,
+      "loss": 2.1134,
+      "step": 50
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.7531919479370117,
+      "learning_rate": 0.0002,
+      "loss": 1.9359,
+      "step": 60
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.8411881327629089,
+      "learning_rate": 0.0002,
+      "loss": 1.8795,
+      "step": 70
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.7217594385147095,
+      "learning_rate": 0.0002,
+      "loss": 1.8772,
+      "step": 80
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.8530973792076111,
+      "learning_rate": 0.0002,
+      "loss": 1.906,
+      "step": 90
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.6478861570358276,
+      "learning_rate": 0.0002,
+      "loss": 1.8438,
+      "step": 100
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6314818263053894,
+      "learning_rate": 0.0002,
+      "loss": 1.801,
+      "step": 110
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.6279414892196655,
+      "learning_rate": 0.0002,
+      "loss": 1.7333,
+      "step": 120
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.6663833856582642,
+      "learning_rate": 0.0002,
+      "loss": 1.7779,
+      "step": 130
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.5576409101486206,
+      "learning_rate": 0.0002,
+      "loss": 1.7262,
+      "step": 140
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.6750475764274597,
+      "learning_rate": 0.0002,
+      "loss": 1.7602,
+      "step": 150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.8057786226272583,
+      "eval_runtime": 186.6609,
+      "eval_samples_per_second": 2.855,
+      "eval_steps_per_second": 0.359,
+      "step": 154
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.5803011655807495,
+      "learning_rate": 0.0002,
+      "loss": 1.6961,
+      "step": 160
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6350723505020142,
+      "learning_rate": 0.0002,
+      "loss": 1.7369,
+      "step": 170
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.7430880069732666,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 180
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7743862271308899,
+      "learning_rate": 0.0002,
+      "loss": 1.6922,
+      "step": 190
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.644690752029419,
+      "learning_rate": 0.0002,
+      "loss": 1.6812,
+      "step": 200
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.6815120577812195,
+      "learning_rate": 0.0002,
+      "loss": 1.6846,
+      "step": 210
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.6068838238716125,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 220
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6361706256866455,
+      "learning_rate": 0.0002,
+      "loss": 1.6935,
+      "step": 230
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.7081064581871033,
+      "learning_rate": 0.0002,
+      "loss": 1.7301,
+      "step": 240
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.638526976108551,
+      "learning_rate": 0.0002,
+      "loss": 1.6151,
+      "step": 250
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6861023306846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 260
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.75590580701828,
+      "learning_rate": 0.0002,
+      "loss": 1.6843,
+      "step": 270
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.7851096987724304,
+      "learning_rate": 0.0002,
+      "loss": 1.7069,
+      "step": 280
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.6292237043380737,
+      "learning_rate": 0.0002,
+      "loss": 1.7264,
+      "step": 290
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6913678646087646,
+      "learning_rate": 0.0002,
+      "loss": 1.7089,
+      "step": 300
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.8135713338851929,
+      "eval_runtime": 187.0663,
+      "eval_samples_per_second": 2.849,
+      "eval_steps_per_second": 0.358,
+      "step": 308
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6253831386566162,
+      "learning_rate": 0.0002,
+      "loss": 1.6608,
+      "step": 310
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.9163504242897034,
+      "learning_rate": 0.0002,
+      "loss": 1.5344,
+      "step": 320
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.7300911545753479,
+      "learning_rate": 0.0002,
+      "loss": 1.4746,
+      "step": 330
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8673648238182068,
+      "learning_rate": 0.0002,
+      "loss": 1.4508,
+      "step": 340
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.8984062671661377,
+      "learning_rate": 0.0002,
+      "loss": 1.5415,
+      "step": 350
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 1.0172897577285767,
+      "learning_rate": 0.0002,
+      "loss": 1.483,
+      "step": 360
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 1.0102241039276123,
+      "learning_rate": 0.0002,
+      "loss": 1.5222,
+      "step": 370
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8766448497772217,
+      "learning_rate": 0.0002,
+      "loss": 1.4976,
+      "step": 380
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.8568485379219055,
+      "learning_rate": 0.0002,
+      "loss": 1.5209,
+      "step": 390
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8487656712532043,
+      "learning_rate": 0.0002,
+      "loss": 1.536,
+      "step": 400
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.866093099117279,
+      "learning_rate": 0.0002,
+      "loss": 1.4806,
+      "step": 410
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 1.0025275945663452,
+      "learning_rate": 0.0002,
+      "loss": 1.5116,
+      "step": 420
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8296443223953247,
+      "learning_rate": 0.0002,
+      "loss": 1.5332,
+      "step": 430
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.9941014647483826,
+      "learning_rate": 0.0002,
+      "loss": 1.5849,
+      "step": 440
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.8613234162330627,
+      "learning_rate": 0.0002,
+      "loss": 1.6162,
+      "step": 450
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.8564832806587219,
+      "learning_rate": 0.0002,
+      "loss": 1.5041,
+      "step": 460
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8579131364822388,
+      "eval_runtime": 187.734,
+      "eval_samples_per_second": 2.839,
+      "eval_steps_per_second": 0.357,
+      "step": 462
+    },
+    {
+      "epoch": 3.051948051948052,
+      "grad_norm": 1.0442030429840088,
+      "learning_rate": 0.0002,
+      "loss": 1.3853,
+      "step": 470
+    },
+    {
+      "epoch": 3.116883116883117,
+      "grad_norm": 1.243507742881775,
+      "learning_rate": 0.0002,
+      "loss": 1.2958,
+      "step": 480
+    },
+    {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 1.338243842124939,
+      "learning_rate": 0.0002,
+      "loss": 1.3303,
+      "step": 490
+    },
+    {
+      "epoch": 3.2467532467532467,
+      "grad_norm": 1.3856316804885864,
+      "learning_rate": 0.0002,
+      "loss": 1.2034,
+      "step": 500
+    },
+    {
+      "epoch": 3.311688311688312,
+      "grad_norm": 1.3414607048034668,
+      "learning_rate": 0.0002,
+      "loss": 1.2052,
+      "step": 510
+    },
+    {
+      "epoch": 3.3766233766233764,
+      "grad_norm": 1.2239990234375,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 520
+    },
+    {
+      "epoch": 3.4415584415584415,
+      "grad_norm": 1.3926455974578857,
+      "learning_rate": 0.0002,
+      "loss": 1.2586,
+      "step": 530
+    },
+    {
+      "epoch": 3.5064935064935066,
+      "grad_norm": 1.3495798110961914,
+      "learning_rate": 0.0002,
+      "loss": 1.2829,
+      "step": 540
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 1.6570665836334229,
+      "learning_rate": 0.0002,
+      "loss": 1.3272,
+      "step": 550
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.2888237237930298,
+      "learning_rate": 0.0002,
+      "loss": 1.3111,
+      "step": 560
+    },
+    {
+      "epoch": 3.7012987012987013,
+      "grad_norm": 1.2630363702774048,
+      "learning_rate": 0.0002,
+      "loss": 1.2834,
+      "step": 570
+    },
+    {
+      "epoch": 3.7662337662337664,
+      "grad_norm": 1.2843817472457886,
+      "learning_rate": 0.0002,
+      "loss": 1.2856,
+      "step": 580
+    },
+    {
+      "epoch": 3.8311688311688314,
+      "grad_norm": 1.1630159616470337,
+      "learning_rate": 0.0002,
+      "loss": 1.3166,
+      "step": 590
+    },
+    {
+      "epoch": 3.896103896103896,
+      "grad_norm": 1.2588003873825073,
+      "learning_rate": 0.0002,
+      "loss": 1.3049,
+      "step": 600
+    },
+    {
+      "epoch": 3.961038961038961,
+      "grad_norm": 1.1966116428375244,
+      "learning_rate": 0.0002,
+      "loss": 1.2935,
+      "step": 610
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9952489137649536,
+      "eval_runtime": 184.0309,
+      "eval_samples_per_second": 2.896,
+      "eval_steps_per_second": 0.364,
+      "step": 616
+    },
+    {
+      "epoch": 4.025974025974026,
+      "grad_norm": 1.4200360774993896,
+      "learning_rate": 0.0002,
+      "loss": 1.2198,
+      "step": 620
+    },
+    {
+      "epoch": 4.090909090909091,
+      "grad_norm": 1.5336390733718872,
+      "learning_rate": 0.0002,
+      "loss": 1.0751,
+      "step": 630
+    },
+    {
+      "epoch": 4.1558441558441555,
+      "grad_norm": 1.9104152917861938,
+      "learning_rate": 0.0002,
+      "loss": 1.0175,
+      "step": 640
+    },
+    {
+      "epoch": 4.220779220779221,
+      "grad_norm": 1.6754790544509888,
+      "learning_rate": 0.0002,
+      "loss": 1.0111,
+      "step": 650
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 1.7546768188476562,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 660
+    },
+    {
+      "epoch": 4.35064935064935,
+      "grad_norm": 1.727729320526123,
+      "learning_rate": 0.0002,
+      "loss": 1.0342,
+      "step": 670
+    },
+    {
+      "epoch": 4.415584415584416,
+      "grad_norm": 1.7832167148590088,
+      "learning_rate": 0.0002,
+      "loss": 0.9964,
+      "step": 680
+    },
+    {
+      "epoch": 4.48051948051948,
+      "grad_norm": 1.7178401947021484,
+      "learning_rate": 0.0002,
+      "loss": 1.1214,
+      "step": 690
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.5840944051742554,
+      "learning_rate": 0.0002,
+      "loss": 1.0412,
+      "step": 700
+    },
+    {
+      "epoch": 4.6103896103896105,
+      "grad_norm": 1.5642529726028442,
+      "learning_rate": 0.0002,
+      "loss": 1.0194,
+      "step": 710
+    },
+    {
+      "epoch": 4.675324675324675,
+      "grad_norm": 1.588742733001709,
+      "learning_rate": 0.0002,
+      "loss": 1.0477,
+      "step": 720
+    },
+    {
+      "epoch": 4.740259740259741,
+      "grad_norm": 1.6103804111480713,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 730
+    },
+    {
+      "epoch": 4.805194805194805,
+      "grad_norm": 1.5183384418487549,
+      "learning_rate": 0.0002,
+      "loss": 1.0604,
+      "step": 740
+    },
+    {
+      "epoch": 4.87012987012987,
+      "grad_norm": 1.3329721689224243,
+      "learning_rate": 0.0002,
+      "loss": 1.1396,
+      "step": 750
+    },
+    {
+      "epoch": 4.935064935064935,
+      "grad_norm": 1.8377444744110107,
+      "learning_rate": 0.0002,
+      "loss": 1.0527,
+      "step": 760
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 1.6057950258255005,
+      "learning_rate": 0.0002,
+      "loss": 1.0263,
+      "step": 770
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.137923240661621,
+      "eval_runtime": 186.6056,
+      "eval_samples_per_second": 2.856,
+      "eval_steps_per_second": 0.359,
+      "step": 770
+    },
+    {
+      "epoch": 5.064935064935065,
+      "grad_norm": 2.6040709018707275,
+      "learning_rate": 0.0002,
+      "loss": 0.7818,
+      "step": 780
+    },
+    {
+      "epoch": 5.12987012987013,
+      "grad_norm": 2.240368366241455,
+      "learning_rate": 0.0002,
+      "loss": 0.7866,
+      "step": 790
+    },
+    {
+      "epoch": 5.194805194805195,
+      "grad_norm": 2.5823593139648438,
+      "learning_rate": 0.0002,
+      "loss": 0.7857,
+      "step": 800
+    },
+    {
+      "epoch": 5.259740259740259,
+      "grad_norm": 2.326618194580078,
+      "learning_rate": 0.0002,
+      "loss": 0.7576,
+      "step": 810
+    },
+    {
+      "epoch": 5.324675324675325,
+      "grad_norm": 1.86579430103302,
+      "learning_rate": 0.0002,
+      "loss": 0.797,
+      "step": 820
+    },
+    {
+      "epoch": 5.3896103896103895,
+      "grad_norm": 1.8907891511917114,
+      "learning_rate": 0.0002,
+      "loss": 0.8163,
+      "step": 830
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 1.8598380088806152,
+      "learning_rate": 0.0002,
+      "loss": 0.7802,
+      "step": 840
+    },
+    {
+      "epoch": 5.51948051948052,
+      "grad_norm": 2.2666990756988525,
+      "learning_rate": 0.0002,
+      "loss": 0.813,
+      "step": 850
+    },
+    {
+      "epoch": 5.584415584415584,
+      "grad_norm": 2.06738018989563,
+      "learning_rate": 0.0002,
+      "loss": 0.8375,
+      "step": 860
+    },
+    {
+      "epoch": 5.64935064935065,
+      "grad_norm": 2.180816888809204,
+      "learning_rate": 0.0002,
+      "loss": 0.8116,
+      "step": 870
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 2.3028717041015625,
+      "learning_rate": 0.0002,
+      "loss": 0.893,
+      "step": 880
+    },
+    {
+      "epoch": 5.779220779220779,
+      "grad_norm": 2.158668041229248,
+      "learning_rate": 0.0002,
+      "loss": 0.8455,
+      "step": 890
+    },
+    {
+      "epoch": 5.8441558441558445,
+      "grad_norm": 2.0822510719299316,
+      "learning_rate": 0.0002,
+      "loss": 0.8719,
+      "step": 900
+    },
+    {
+      "epoch": 5.909090909090909,
+      "grad_norm": 2.0678226947784424,
+      "learning_rate": 0.0002,
+      "loss": 0.8302,
+      "step": 910
+    },
+    {
+      "epoch": 5.974025974025974,
+      "grad_norm": 1.9154915809631348,
+      "learning_rate": 0.0002,
+      "loss": 0.8747,
+      "step": 920
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.3475306034088135,
+      "eval_runtime": 187.3275,
+      "eval_samples_per_second": 2.845,
+      "eval_steps_per_second": 0.358,
+      "step": 924
+    },
+    {
+      "epoch": 6.038961038961039,
+      "grad_norm": 2.5734288692474365,
+      "learning_rate": 0.0002,
+      "loss": 0.6868,
+      "step": 930
+    },
+    {
+      "epoch": 6.103896103896104,
+      "grad_norm": 2.1251583099365234,
+      "learning_rate": 0.0002,
+      "loss": 0.581,
+      "step": 940
+    },
+    {
+      "epoch": 6.1688311688311686,
+      "grad_norm": 2.346284866333008,
+      "learning_rate": 0.0002,
+      "loss": 0.5784,
+      "step": 950
+    },
+    {
+      "epoch": 6.233766233766234,
+      "grad_norm": 2.262770175933838,
+      "learning_rate": 0.0002,
+      "loss": 0.6225,
+      "step": 960
+    },
+    {
+      "epoch": 6.298701298701299,
+      "grad_norm": 2.5575172901153564,
+      "learning_rate": 0.0002,
+      "loss": 0.5593,
+      "step": 970
+    },
+    {
+      "epoch": 6.363636363636363,
+      "grad_norm": 2.811757802963257,
+      "learning_rate": 0.0002,
+      "loss": 0.6017,
+      "step": 980
+    },
+    {
+      "epoch": 6.428571428571429,
+      "grad_norm": 2.3052585124969482,
+      "learning_rate": 0.0002,
+      "loss": 0.6134,
+      "step": 990
+    },
+    {
+      "epoch": 6.4935064935064934,
+      "grad_norm": 2.2371861934661865,
+      "learning_rate": 0.0002,
+      "loss": 0.639,
+      "step": 1000
+    },
+    {
+      "epoch": 6.558441558441558,
+      "grad_norm": 2.974090814590454,
+      "learning_rate": 0.0002,
+      "loss": 0.629,
+      "step": 1010
+    },
+    {
+      "epoch": 6.623376623376624,
+      "grad_norm": 3.1924889087677,
+      "learning_rate": 0.0002,
+      "loss": 0.6068,
+      "step": 1020
+    },
+    {
+      "epoch": 6.688311688311688,
+      "grad_norm": 2.322031021118164,
+      "learning_rate": 0.0002,
+      "loss": 0.6519,
+      "step": 1030
+    },
+    {
+      "epoch": 6.753246753246753,
+      "grad_norm": 4.84075927734375,
+      "learning_rate": 0.0002,
+      "loss": 0.6218,
+      "step": 1040
+    },
+    {
+      "epoch": 6.818181818181818,
+      "grad_norm": 2.509589433670044,
+      "learning_rate": 0.0002,
+      "loss": 0.6521,
+      "step": 1050
+    },
+    {
+      "epoch": 6.883116883116883,
+      "grad_norm": 2.2627809047698975,
+      "learning_rate": 0.0002,
+      "loss": 0.692,
+      "step": 1060
+    },
+    {
+      "epoch": 6.948051948051948,
+      "grad_norm": 2.528601884841919,
+      "learning_rate": 0.0002,
+      "loss": 0.683,
+      "step": 1070
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.6560964584350586,
+      "eval_runtime": 187.5894,
+      "eval_samples_per_second": 2.841,
+      "eval_steps_per_second": 0.357,
+      "step": 1078
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1232,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.72969553903616e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b4f8ab66999c14764da42af0b1d3f87c70ee10ed09cb969b5621d5b7a29a6a
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-1232/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e9c3f2ec4f8288cc1ee6081b53b05b2abb1d7d46f8efb72e38ca20603f97c34
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3393bede28d0abccd71978988e9467e799fd026cd457f5f2e3b58fb9bf5cc92b
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2874ac51c50bd318f83600cf5deb9dae131acc315311ba94a082af6ef26241d7
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:442644f04018009e339745676d855af49e719afd045348ca5915b5bd0a5436e0
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,958 @@

+{
+  "best_metric": 1.8057786226272583,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154",
+  "epoch": 8.0,
+  "eval_steps": 10,
+  "global_step": 1232,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9856782555580139,
+      "learning_rate": 0.0002,
+      "loss": 2.593,
+      "step": 10
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 1.0205717086791992,
+      "learning_rate": 0.0002,
+      "loss": 2.229,
+      "step": 20
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.7780327200889587,
+      "learning_rate": 0.0002,
+      "loss": 2.0632,
+      "step": 30
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.7994171977043152,
+      "learning_rate": 0.0002,
+      "loss": 2.0682,
+      "step": 40
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.7783251404762268,
+      "learning_rate": 0.0002,
+      "loss": 2.1134,
+      "step": 50
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.7531919479370117,
+      "learning_rate": 0.0002,
+      "loss": 1.9359,
+      "step": 60
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.8411881327629089,
+      "learning_rate": 0.0002,
+      "loss": 1.8795,
+      "step": 70
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.7217594385147095,
+      "learning_rate": 0.0002,
+      "loss": 1.8772,
+      "step": 80
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.8530973792076111,
+      "learning_rate": 0.0002,
+      "loss": 1.906,
+      "step": 90
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.6478861570358276,
+      "learning_rate": 0.0002,
+      "loss": 1.8438,
+      "step": 100
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6314818263053894,
+      "learning_rate": 0.0002,
+      "loss": 1.801,
+      "step": 110
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.6279414892196655,
+      "learning_rate": 0.0002,
+      "loss": 1.7333,
+      "step": 120
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.6663833856582642,
+      "learning_rate": 0.0002,
+      "loss": 1.7779,
+      "step": 130
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.5576409101486206,
+      "learning_rate": 0.0002,
+      "loss": 1.7262,
+      "step": 140
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.6750475764274597,
+      "learning_rate": 0.0002,
+      "loss": 1.7602,
+      "step": 150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.8057786226272583,
+      "eval_runtime": 186.6609,
+      "eval_samples_per_second": 2.855,
+      "eval_steps_per_second": 0.359,
+      "step": 154
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.5803011655807495,
+      "learning_rate": 0.0002,
+      "loss": 1.6961,
+      "step": 160
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6350723505020142,
+      "learning_rate": 0.0002,
+      "loss": 1.7369,
+      "step": 170
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.7430880069732666,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 180
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7743862271308899,
+      "learning_rate": 0.0002,
+      "loss": 1.6922,
+      "step": 190
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.644690752029419,
+      "learning_rate": 0.0002,
+      "loss": 1.6812,
+      "step": 200
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.6815120577812195,
+      "learning_rate": 0.0002,
+      "loss": 1.6846,
+      "step": 210
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.6068838238716125,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 220
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6361706256866455,
+      "learning_rate": 0.0002,
+      "loss": 1.6935,
+      "step": 230
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.7081064581871033,
+      "learning_rate": 0.0002,
+      "loss": 1.7301,
+      "step": 240
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.638526976108551,
+      "learning_rate": 0.0002,
+      "loss": 1.6151,
+      "step": 250
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6861023306846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 260
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.75590580701828,
+      "learning_rate": 0.0002,
+      "loss": 1.6843,
+      "step": 270
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.7851096987724304,
+      "learning_rate": 0.0002,
+      "loss": 1.7069,
+      "step": 280
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.6292237043380737,
+      "learning_rate": 0.0002,
+      "loss": 1.7264,
+      "step": 290
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6913678646087646,
+      "learning_rate": 0.0002,
+      "loss": 1.7089,
+      "step": 300
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.8135713338851929,
+      "eval_runtime": 187.0663,
+      "eval_samples_per_second": 2.849,
+      "eval_steps_per_second": 0.358,
+      "step": 308
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6253831386566162,
+      "learning_rate": 0.0002,
+      "loss": 1.6608,
+      "step": 310
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.9163504242897034,
+      "learning_rate": 0.0002,
+      "loss": 1.5344,
+      "step": 320
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.7300911545753479,
+      "learning_rate": 0.0002,
+      "loss": 1.4746,
+      "step": 330
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8673648238182068,
+      "learning_rate": 0.0002,
+      "loss": 1.4508,
+      "step": 340
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.8984062671661377,
+      "learning_rate": 0.0002,
+      "loss": 1.5415,
+      "step": 350
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 1.0172897577285767,
+      "learning_rate": 0.0002,
+      "loss": 1.483,
+      "step": 360
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 1.0102241039276123,
+      "learning_rate": 0.0002,
+      "loss": 1.5222,
+      "step": 370
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8766448497772217,
+      "learning_rate": 0.0002,
+      "loss": 1.4976,
+      "step": 380
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.8568485379219055,
+      "learning_rate": 0.0002,
+      "loss": 1.5209,
+      "step": 390
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8487656712532043,
+      "learning_rate": 0.0002,
+      "loss": 1.536,
+      "step": 400
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.866093099117279,
+      "learning_rate": 0.0002,
+      "loss": 1.4806,
+      "step": 410
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 1.0025275945663452,
+      "learning_rate": 0.0002,
+      "loss": 1.5116,
+      "step": 420
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8296443223953247,
+      "learning_rate": 0.0002,
+      "loss": 1.5332,
+      "step": 430
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.9941014647483826,
+      "learning_rate": 0.0002,
+      "loss": 1.5849,
+      "step": 440
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.8613234162330627,
+      "learning_rate": 0.0002,
+      "loss": 1.6162,
+      "step": 450
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.8564832806587219,
+      "learning_rate": 0.0002,
+      "loss": 1.5041,
+      "step": 460
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8579131364822388,
+      "eval_runtime": 187.734,
+      "eval_samples_per_second": 2.839,
+      "eval_steps_per_second": 0.357,
+      "step": 462
+    },
+    {
+      "epoch": 3.051948051948052,
+      "grad_norm": 1.0442030429840088,
+      "learning_rate": 0.0002,
+      "loss": 1.3853,
+      "step": 470
+    },
+    {
+      "epoch": 3.116883116883117,
+      "grad_norm": 1.243507742881775,
+      "learning_rate": 0.0002,
+      "loss": 1.2958,
+      "step": 480
+    },
+    {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 1.338243842124939,
+      "learning_rate": 0.0002,
+      "loss": 1.3303,
+      "step": 490
+    },
+    {
+      "epoch": 3.2467532467532467,
+      "grad_norm": 1.3856316804885864,
+      "learning_rate": 0.0002,
+      "loss": 1.2034,
+      "step": 500
+    },
+    {
+      "epoch": 3.311688311688312,
+      "grad_norm": 1.3414607048034668,
+      "learning_rate": 0.0002,
+      "loss": 1.2052,
+      "step": 510
+    },
+    {
+      "epoch": 3.3766233766233764,
+      "grad_norm": 1.2239990234375,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 520
+    },
+    {
+      "epoch": 3.4415584415584415,
+      "grad_norm": 1.3926455974578857,
+      "learning_rate": 0.0002,
+      "loss": 1.2586,
+      "step": 530
+    },
+    {
+      "epoch": 3.5064935064935066,
+      "grad_norm": 1.3495798110961914,
+      "learning_rate": 0.0002,
+      "loss": 1.2829,
+      "step": 540
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 1.6570665836334229,
+      "learning_rate": 0.0002,
+      "loss": 1.3272,
+      "step": 550
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.2888237237930298,
+      "learning_rate": 0.0002,
+      "loss": 1.3111,
+      "step": 560
+    },
+    {
+      "epoch": 3.7012987012987013,
+      "grad_norm": 1.2630363702774048,
+      "learning_rate": 0.0002,
+      "loss": 1.2834,
+      "step": 570
+    },
+    {
+      "epoch": 3.7662337662337664,
+      "grad_norm": 1.2843817472457886,
+      "learning_rate": 0.0002,
+      "loss": 1.2856,
+      "step": 580
+    },
+    {
+      "epoch": 3.8311688311688314,
+      "grad_norm": 1.1630159616470337,
+      "learning_rate": 0.0002,
+      "loss": 1.3166,
+      "step": 590
+    },
+    {
+      "epoch": 3.896103896103896,
+      "grad_norm": 1.2588003873825073,
+      "learning_rate": 0.0002,
+      "loss": 1.3049,
+      "step": 600
+    },
+    {
+      "epoch": 3.961038961038961,
+      "grad_norm": 1.1966116428375244,
+      "learning_rate": 0.0002,
+      "loss": 1.2935,
+      "step": 610
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9952489137649536,
+      "eval_runtime": 184.0309,
+      "eval_samples_per_second": 2.896,
+      "eval_steps_per_second": 0.364,
+      "step": 616
+    },
+    {
+      "epoch": 4.025974025974026,
+      "grad_norm": 1.4200360774993896,
+      "learning_rate": 0.0002,
+      "loss": 1.2198,
+      "step": 620
+    },
+    {
+      "epoch": 4.090909090909091,
+      "grad_norm": 1.5336390733718872,
+      "learning_rate": 0.0002,
+      "loss": 1.0751,
+      "step": 630
+    },
+    {
+      "epoch": 4.1558441558441555,
+      "grad_norm": 1.9104152917861938,
+      "learning_rate": 0.0002,
+      "loss": 1.0175,
+      "step": 640
+    },
+    {
+      "epoch": 4.220779220779221,
+      "grad_norm": 1.6754790544509888,
+      "learning_rate": 0.0002,
+      "loss": 1.0111,
+      "step": 650
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 1.7546768188476562,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 660
+    },
+    {
+      "epoch": 4.35064935064935,
+      "grad_norm": 1.727729320526123,
+      "learning_rate": 0.0002,
+      "loss": 1.0342,
+      "step": 670
+    },
+    {
+      "epoch": 4.415584415584416,
+      "grad_norm": 1.7832167148590088,
+      "learning_rate": 0.0002,
+      "loss": 0.9964,
+      "step": 680
+    },
+    {
+      "epoch": 4.48051948051948,
+      "grad_norm": 1.7178401947021484,
+      "learning_rate": 0.0002,
+      "loss": 1.1214,
+      "step": 690
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.5840944051742554,
+      "learning_rate": 0.0002,
+      "loss": 1.0412,
+      "step": 700
+    },
+    {
+      "epoch": 4.6103896103896105,
+      "grad_norm": 1.5642529726028442,
+      "learning_rate": 0.0002,
+      "loss": 1.0194,
+      "step": 710
+    },
+    {
+      "epoch": 4.675324675324675,
+      "grad_norm": 1.588742733001709,
+      "learning_rate": 0.0002,
+      "loss": 1.0477,
+      "step": 720
+    },
+    {
+      "epoch": 4.740259740259741,
+      "grad_norm": 1.6103804111480713,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 730
+    },
+    {
+      "epoch": 4.805194805194805,
+      "grad_norm": 1.5183384418487549,
+      "learning_rate": 0.0002,
+      "loss": 1.0604,
+      "step": 740
+    },
+    {
+      "epoch": 4.87012987012987,
+      "grad_norm": 1.3329721689224243,
+      "learning_rate": 0.0002,
+      "loss": 1.1396,
+      "step": 750
+    },
+    {
+      "epoch": 4.935064935064935,
+      "grad_norm": 1.8377444744110107,
+      "learning_rate": 0.0002,
+      "loss": 1.0527,
+      "step": 760
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 1.6057950258255005,
+      "learning_rate": 0.0002,
+      "loss": 1.0263,
+      "step": 770
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.137923240661621,
+      "eval_runtime": 186.6056,
+      "eval_samples_per_second": 2.856,
+      "eval_steps_per_second": 0.359,
+      "step": 770
+    },
+    {
+      "epoch": 5.064935064935065,
+      "grad_norm": 2.6040709018707275,
+      "learning_rate": 0.0002,
+      "loss": 0.7818,
+      "step": 780
+    },
+    {
+      "epoch": 5.12987012987013,
+      "grad_norm": 2.240368366241455,
+      "learning_rate": 0.0002,
+      "loss": 0.7866,
+      "step": 790
+    },
+    {
+      "epoch": 5.194805194805195,
+      "grad_norm": 2.5823593139648438,
+      "learning_rate": 0.0002,
+      "loss": 0.7857,
+      "step": 800
+    },
+    {
+      "epoch": 5.259740259740259,
+      "grad_norm": 2.326618194580078,
+      "learning_rate": 0.0002,
+      "loss": 0.7576,
+      "step": 810
+    },
+    {
+      "epoch": 5.324675324675325,
+      "grad_norm": 1.86579430103302,
+      "learning_rate": 0.0002,
+      "loss": 0.797,
+      "step": 820
+    },
+    {
+      "epoch": 5.3896103896103895,
+      "grad_norm": 1.8907891511917114,
+      "learning_rate": 0.0002,
+      "loss": 0.8163,
+      "step": 830
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 1.8598380088806152,
+      "learning_rate": 0.0002,
+      "loss": 0.7802,
+      "step": 840
+    },
+    {
+      "epoch": 5.51948051948052,
+      "grad_norm": 2.2666990756988525,
+      "learning_rate": 0.0002,
+      "loss": 0.813,
+      "step": 850
+    },
+    {
+      "epoch": 5.584415584415584,
+      "grad_norm": 2.06738018989563,
+      "learning_rate": 0.0002,
+      "loss": 0.8375,
+      "step": 860
+    },
+    {
+      "epoch": 5.64935064935065,
+      "grad_norm": 2.180816888809204,
+      "learning_rate": 0.0002,
+      "loss": 0.8116,
+      "step": 870
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 2.3028717041015625,
+      "learning_rate": 0.0002,
+      "loss": 0.893,
+      "step": 880
+    },
+    {
+      "epoch": 5.779220779220779,
+      "grad_norm": 2.158668041229248,
+      "learning_rate": 0.0002,
+      "loss": 0.8455,
+      "step": 890
+    },
+    {
+      "epoch": 5.8441558441558445,
+      "grad_norm": 2.0822510719299316,
+      "learning_rate": 0.0002,
+      "loss": 0.8719,
+      "step": 900
+    },
+    {
+      "epoch": 5.909090909090909,
+      "grad_norm": 2.0678226947784424,
+      "learning_rate": 0.0002,
+      "loss": 0.8302,
+      "step": 910
+    },
+    {
+      "epoch": 5.974025974025974,
+      "grad_norm": 1.9154915809631348,
+      "learning_rate": 0.0002,
+      "loss": 0.8747,
+      "step": 920
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.3475306034088135,
+      "eval_runtime": 187.3275,
+      "eval_samples_per_second": 2.845,
+      "eval_steps_per_second": 0.358,
+      "step": 924
+    },
+    {
+      "epoch": 6.038961038961039,
+      "grad_norm": 2.5734288692474365,
+      "learning_rate": 0.0002,
+      "loss": 0.6868,
+      "step": 930
+    },
+    {
+      "epoch": 6.103896103896104,
+      "grad_norm": 2.1251583099365234,
+      "learning_rate": 0.0002,
+      "loss": 0.581,
+      "step": 940
+    },
+    {
+      "epoch": 6.1688311688311686,
+      "grad_norm": 2.346284866333008,
+      "learning_rate": 0.0002,
+      "loss": 0.5784,
+      "step": 950
+    },
+    {
+      "epoch": 6.233766233766234,
+      "grad_norm": 2.262770175933838,
+      "learning_rate": 0.0002,
+      "loss": 0.6225,
+      "step": 960
+    },
+    {
+      "epoch": 6.298701298701299,
+      "grad_norm": 2.5575172901153564,
+      "learning_rate": 0.0002,
+      "loss": 0.5593,
+      "step": 970
+    },
+    {
+      "epoch": 6.363636363636363,
+      "grad_norm": 2.811757802963257,
+      "learning_rate": 0.0002,
+      "loss": 0.6017,
+      "step": 980
+    },
+    {
+      "epoch": 6.428571428571429,
+      "grad_norm": 2.3052585124969482,
+      "learning_rate": 0.0002,
+      "loss": 0.6134,
+      "step": 990
+    },
+    {
+      "epoch": 6.4935064935064934,
+      "grad_norm": 2.2371861934661865,
+      "learning_rate": 0.0002,
+      "loss": 0.639,
+      "step": 1000
+    },
+    {
+      "epoch": 6.558441558441558,
+      "grad_norm": 2.974090814590454,
+      "learning_rate": 0.0002,
+      "loss": 0.629,
+      "step": 1010
+    },
+    {
+      "epoch": 6.623376623376624,
+      "grad_norm": 3.1924889087677,
+      "learning_rate": 0.0002,
+      "loss": 0.6068,
+      "step": 1020
+    },
+    {
+      "epoch": 6.688311688311688,
+      "grad_norm": 2.322031021118164,
+      "learning_rate": 0.0002,
+      "loss": 0.6519,
+      "step": 1030
+    },
+    {
+      "epoch": 6.753246753246753,
+      "grad_norm": 4.84075927734375,
+      "learning_rate": 0.0002,
+      "loss": 0.6218,
+      "step": 1040
+    },
+    {
+      "epoch": 6.818181818181818,
+      "grad_norm": 2.509589433670044,
+      "learning_rate": 0.0002,
+      "loss": 0.6521,
+      "step": 1050
+    },
+    {
+      "epoch": 6.883116883116883,
+      "grad_norm": 2.2627809047698975,
+      "learning_rate": 0.0002,
+      "loss": 0.692,
+      "step": 1060
+    },
+    {
+      "epoch": 6.948051948051948,
+      "grad_norm": 2.528601884841919,
+      "learning_rate": 0.0002,
+      "loss": 0.683,
+      "step": 1070
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 2.6560964584350586,
+      "eval_runtime": 187.5894,
+      "eval_samples_per_second": 2.841,
+      "eval_steps_per_second": 0.357,
+      "step": 1078
+    },
+    {
+      "epoch": 7.012987012987013,
+      "grad_norm": 2.4132769107818604,
+      "learning_rate": 0.0002,
+      "loss": 0.617,
+      "step": 1080
+    },
+    {
+      "epoch": 7.077922077922078,
+      "grad_norm": 2.721569776535034,
+      "learning_rate": 0.0002,
+      "loss": 0.4511,
+      "step": 1090
+    },
+    {
+      "epoch": 7.142857142857143,
+      "grad_norm": 2.276289939880371,
+      "learning_rate": 0.0002,
+      "loss": 0.4225,
+      "step": 1100
+    },
+    {
+      "epoch": 7.207792207792208,
+      "grad_norm": 2.400822401046753,
+      "learning_rate": 0.0002,
+      "loss": 0.4147,
+      "step": 1110
+    },
+    {
+      "epoch": 7.2727272727272725,
+      "grad_norm": 3.4264252185821533,
+      "learning_rate": 0.0002,
+      "loss": 0.445,
+      "step": 1120
+    },
+    {
+      "epoch": 7.337662337662338,
+      "grad_norm": 1.8684237003326416,
+      "learning_rate": 0.0002,
+      "loss": 0.4551,
+      "step": 1130
+    },
+    {
+      "epoch": 7.402597402597403,
+      "grad_norm": 2.1616084575653076,
+      "learning_rate": 0.0002,
+      "loss": 0.4907,
+      "step": 1140
+    },
+    {
+      "epoch": 7.467532467532467,
+      "grad_norm": 2.7231879234313965,
+      "learning_rate": 0.0002,
+      "loss": 0.4833,
+      "step": 1150
+    },
+    {
+      "epoch": 7.532467532467533,
+      "grad_norm": 2.375119924545288,
+      "learning_rate": 0.0002,
+      "loss": 0.4904,
+      "step": 1160
+    },
+    {
+      "epoch": 7.597402597402597,
+      "grad_norm": 2.4779438972473145,
+      "learning_rate": 0.0002,
+      "loss": 0.4785,
+      "step": 1170
+    },
+    {
+      "epoch": 7.662337662337662,
+      "grad_norm": 2.5369439125061035,
+      "learning_rate": 0.0002,
+      "loss": 0.4899,
+      "step": 1180
+    },
+    {
+      "epoch": 7.7272727272727275,
+      "grad_norm": 3.769383430480957,
+      "learning_rate": 0.0002,
+      "loss": 0.4517,
+      "step": 1190
+    },
+    {
+      "epoch": 7.792207792207792,
+      "grad_norm": 2.8356423377990723,
+      "learning_rate": 0.0002,
+      "loss": 0.4765,
+      "step": 1200
+    },
+    {
+      "epoch": 7.857142857142857,
+      "grad_norm": 2.4924838542938232,
+      "learning_rate": 0.0002,
+      "loss": 0.4691,
+      "step": 1210
+    },
+    {
+      "epoch": 7.922077922077922,
+      "grad_norm": 3.033877372741699,
+      "learning_rate": 0.0002,
+      "loss": 0.5528,
+      "step": 1220
+    },
+    {
+      "epoch": 7.987012987012987,
+      "grad_norm": 3.1925995349884033,
+      "learning_rate": 0.0002,
+      "loss": 0.4878,
+      "step": 1230
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 2.8629283905029297,
+      "eval_runtime": 181.6642,
+      "eval_samples_per_second": 2.934,
+      "eval_steps_per_second": 0.369,
+      "step": 1232
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1232,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.40536633032704e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b4f8ab66999c14764da42af0b1d3f87c70ee10ed09cb969b5621d5b7a29a6a
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b15ba179d52b965afc887dcefa50ef00275d51afaf4596b021f286670ab61f5b
+size 109069176

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b949ec7402119acf3f11237aba79cbce87de941357a1e7d96da79d0167f3e7df
+size 55532538

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfd4673cebc5e0872b7629775f8f031fc1840ff362a0059c8a2f6b2e6839b924
+size 14244

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:def102553842dd9e7c89cc4a95ebc89894127a7edcd68c689fae3bed677e6e12
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,146 @@

+{
+  "best_metric": 1.8057786226272583,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154",
+  "epoch": 1.0,
+  "eval_steps": 10,
+  "global_step": 154,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9856782555580139,
+      "learning_rate": 0.0002,
+      "loss": 2.593,
+      "step": 10
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 1.0205717086791992,
+      "learning_rate": 0.0002,
+      "loss": 2.229,
+      "step": 20
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.7780327200889587,
+      "learning_rate": 0.0002,
+      "loss": 2.0632,
+      "step": 30
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.7994171977043152,
+      "learning_rate": 0.0002,
+      "loss": 2.0682,
+      "step": 40
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.7783251404762268,
+      "learning_rate": 0.0002,
+      "loss": 2.1134,
+      "step": 50
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.7531919479370117,
+      "learning_rate": 0.0002,
+      "loss": 1.9359,
+      "step": 60
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.8411881327629089,
+      "learning_rate": 0.0002,
+      "loss": 1.8795,
+      "step": 70
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.7217594385147095,
+      "learning_rate": 0.0002,
+      "loss": 1.8772,
+      "step": 80
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.8530973792076111,
+      "learning_rate": 0.0002,
+      "loss": 1.906,
+      "step": 90
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.6478861570358276,
+      "learning_rate": 0.0002,
+      "loss": 1.8438,
+      "step": 100
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6314818263053894,
+      "learning_rate": 0.0002,
+      "loss": 1.801,
+      "step": 110
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.6279414892196655,
+      "learning_rate": 0.0002,
+      "loss": 1.7333,
+      "step": 120
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.6663833856582642,
+      "learning_rate": 0.0002,
+      "loss": 1.7779,
+      "step": 130
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.5576409101486206,
+      "learning_rate": 0.0002,
+      "loss": 1.7262,
+      "step": 140
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.6750475764274597,
+      "learning_rate": 0.0002,
+      "loss": 1.7602,
+      "step": 150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.8057786226272583,
+      "eval_runtime": 186.6609,
+      "eval_samples_per_second": 2.855,
+      "eval_steps_per_second": 0.359,
+      "step": 154
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1232,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6756707912908800.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:36b4f8ab66999c14764da42af0b1d3f87c70ee10ed09cb969b5621d5b7a29a6a
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:84451b02b2f9ca9a1685a28f9cdec8c703a0fc6f1a4aa1da65bf19fee974c0f3
+size 109069176

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96691d13ebaaf50530b8292912e331bd17a4aae7f2cf0981bd1add29c2eead7a
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7a30edc965bc8b4b0ae846821608a54a6428c8dad6c4f817d834245b2dc188c
+size 14244

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-308/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da1e6ce4563fb7c39319687090a034259cf8a2d7fdc85c19a6037693718b36f9
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,259 @@

+{
+  "best_metric": 1.8057786226272583,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-702-sd-4/checkpoint-154",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 308,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9856782555580139,
+      "learning_rate": 0.0002,
+      "loss": 2.593,
+      "step": 10
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 1.0205717086791992,
+      "learning_rate": 0.0002,
+      "loss": 2.229,
+      "step": 20
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.7780327200889587,
+      "learning_rate": 0.0002,
+      "loss": 2.0632,
+      "step": 30
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.7994171977043152,
+      "learning_rate": 0.0002,
+      "loss": 2.0682,
+      "step": 40
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.7783251404762268,
+      "learning_rate": 0.0002,
+      "loss": 2.1134,
+      "step": 50
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.7531919479370117,
+      "learning_rate": 0.0002,
+      "loss": 1.9359,
+      "step": 60
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.8411881327629089,
+      "learning_rate": 0.0002,
+      "loss": 1.8795,
+      "step": 70
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.7217594385147095,
+      "learning_rate": 0.0002,
+      "loss": 1.8772,
+      "step": 80
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.8530973792076111,
+      "learning_rate": 0.0002,
+      "loss": 1.906,
+      "step": 90
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.6478861570358276,
+      "learning_rate": 0.0002,
+      "loss": 1.8438,
+      "step": 100
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6314818263053894,
+      "learning_rate": 0.0002,
+      "loss": 1.801,
+      "step": 110
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.6279414892196655,
+      "learning_rate": 0.0002,
+      "loss": 1.7333,
+      "step": 120
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.6663833856582642,
+      "learning_rate": 0.0002,
+      "loss": 1.7779,
+      "step": 130
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.5576409101486206,
+      "learning_rate": 0.0002,
+      "loss": 1.7262,
+      "step": 140
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.6750475764274597,
+      "learning_rate": 0.0002,
+      "loss": 1.7602,
+      "step": 150
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.8057786226272583,
+      "eval_runtime": 186.6609,
+      "eval_samples_per_second": 2.855,
+      "eval_steps_per_second": 0.359,
+      "step": 154
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 0.5803011655807495,
+      "learning_rate": 0.0002,
+      "loss": 1.6961,
+      "step": 160
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6350723505020142,
+      "learning_rate": 0.0002,
+      "loss": 1.7369,
+      "step": 170
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.7430880069732666,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 180
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7743862271308899,
+      "learning_rate": 0.0002,
+      "loss": 1.6922,
+      "step": 190
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.644690752029419,
+      "learning_rate": 0.0002,
+      "loss": 1.6812,
+      "step": 200
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.6815120577812195,
+      "learning_rate": 0.0002,
+      "loss": 1.6846,
+      "step": 210
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.6068838238716125,
+      "learning_rate": 0.0002,
+      "loss": 1.7777,
+      "step": 220
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6361706256866455,
+      "learning_rate": 0.0002,
+      "loss": 1.6935,
+      "step": 230
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.7081064581871033,
+      "learning_rate": 0.0002,
+      "loss": 1.7301,
+      "step": 240
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.638526976108551,
+      "learning_rate": 0.0002,
+      "loss": 1.6151,
+      "step": 250
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6861023306846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 260
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.75590580701828,
+      "learning_rate": 0.0002,
+      "loss": 1.6843,
+      "step": 270
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.7851096987724304,
+      "learning_rate": 0.0002,
+      "loss": 1.7069,
+      "step": 280
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.6292237043380737,
+      "learning_rate": 0.0002,
+      "loss": 1.7264,
+      "step": 290
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6913678646087646,
+      "learning_rate": 0.0002,
+      "loss": 1.7089,
+      "step": 300
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.8135713338851929,
+      "eval_runtime": 187.0663,
+      "eval_samples_per_second": 2.849,
+      "eval_steps_per_second": 0.358,
+      "step": 308
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1232,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.35134158258176e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}