CharlesLi committed (verified)
Commit b35781a · Parent: 4061c18

Model save

README.md CHANGED
@@ -1,12 +1,8 @@
  ---
  library_name: transformers
- license: llama2
- base_model: meta-llama/Llama-2-7b-chat-hf
+ license: apache-2.0
+ base_model: mistralai/Mistral-7B-Instruct-v0.1
  tags:
- - alignment-handbook
- - trl
- - sft
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
@@ -20,9 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 
  # mistral_o1_005_full
 
- This model is a fine-tuned version of [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on the None dataset.
- It achieves the following results on the evaluation set:
- - Loss: 0.8031
+ This model is a fine-tuned version of [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) on the None dataset.
 
  ## Model description
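Taken together, the card now describes a supervised fine-tune of mistralai/Mistral-7B-Instruct-v0.1. A minimal usage sketch, assuming the checkpoint is published as CharlesLi/mistral_o1_005_full (repo id inferred from the commit author and model name, not stated in the card):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "CharlesLi/mistral_o1_005_full"  # assumed repo id; adjust as needed

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16)

messages = [{"role": "user", "content": "Briefly introduce yourself."}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")

# Sampling settings must be passed explicitly: this commit also strips the
# temperature/top_p defaults from generation_config.json (see below).
output = model.generate(
    input_ids, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.9
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```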
 
all_results.json CHANGED
@@ -1,14 +1,9 @@
  {
  "epoch": 0.9565217391304348,
- "eval_loss": 0.803080677986145,
- "eval_runtime": 1.3886,
- "eval_samples": 20,
- "eval_samples_per_second": 14.403,
- "eval_steps_per_second": 1.44,
- "total_flos": 1083185692672.0,
- "train_loss": 0.9741405790502374,
- "train_runtime": 62.2828,
+ "total_flos": 1081332727808.0,
+ "train_loss": 0.8119967471469532,
+ "train_runtime": 65.397,
  "train_samples": 368,
- "train_samples_per_second": 5.909,
- "train_steps_per_second": 0.177
+ "train_samples_per_second": 5.627,
+ "train_steps_per_second": 0.168
  }
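The updated throughput numbers are self-consistent with the run metadata (368 training samples over roughly 65.4 seconds); a quick arithmetic check:

```python
# Sanity-check the logged throughput in all_results.json / train_results.json.
train_samples = 368
train_runtime = 65.397  # seconds

print(train_samples / train_runtime)  # ~5.627, matches train_samples_per_second
```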
config.json CHANGED
@@ -1,29 +1,27 @@
  {
- "_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
+ "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
  "architectures": [
- "LlamaForCausalLM"
+ "MistralForCausalLM"
  ],
- "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
+ "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
- "intermediate_size": 11008,
- "max_position_embeddings": 4096,
- "mlp_bias": false,
- "model_type": "llama",
+ "intermediate_size": 14336,
+ "max_position_embeddings": 32768,
+ "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
- "num_key_value_heads": 32,
- "pretraining_tp": 1,
+ "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
- "rope_scaling": null,
  "rope_theta": 10000.0,
+ "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
- "use_cache": true,
+ "use_cache": false,
  "vocab_size": 32000
  }
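The new config matches the standard Mistral-7B-Instruct-v0.1 layout: grouped-query attention (32 query heads, 8 key/value heads), a 14336-wide MLP, 32768 max positions, and 4096-token sliding-window attention. A small inspection sketch, again assuming the CharlesLi/mistral_o1_005_full repo id:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("CharlesLi/mistral_o1_005_full")  # assumed repo id

print(config.model_type)            # "mistral"
print(config.num_attention_heads)   # 32 query heads
print(config.num_key_value_heads)   # 8 KV heads -> grouped-query attention
print(config.intermediate_size)     # 14336
print(config.sliding_window)        # 4096
```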
generation_config.json CHANGED
@@ -1,10 +1,6 @@
  {
+ "_from_model_config": true,
  "bos_token_id": 1,
- "do_sample": true,
  "eos_token_id": 2,
- "max_length": 4096,
- "pad_token_id": 0,
- "temperature": 0.6,
- "top_p": 0.9,
  "transformers_version": "4.44.2"
  }
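Only the special-token ids survive in the saved generation config, so decoding defaults to greedy unless sampling parameters are supplied per call. A small sketch (assumed repo id as above):

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("CharlesLi/mistral_o1_005_full")  # assumed repo id

# bos/eos ids remain; do_sample, temperature and top_p were removed, so pass
# them explicitly to generate() when sampling is wanted.
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)
```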
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e70093c66dc0e8fc3ad279b524c73b67d8a2d1e6229958c69376332271e278c0
- size 4938985352
+ oid sha256:c1a91f7e17b20f6f785ac3121e63de20e0312735ad4e7b6edee5b125c02ff5b3
+ size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b824e7aab109170f398e9bd2ea075df9a5419de7d03a504a600b3e897ee2679c
- size 4947390880
+ oid sha256:8fd5c8d88cf7ed1822c75cb15cb4428175f2a7dea4128d41e4ff77d5acf4bd0f
+ size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:45cac3041405070e7b71ae58d7677d06f46602f6e36f5917354b43de23ac7674
- size 3590488816
+ oid sha256:dcbd48a79233d29350e3b29bca8ae69f17ed2ac8fee107b5a408941e50c8ae25
+ size 4540516344
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
  {
  "metadata": {
- "total_size": 13476831232
+ "total_size": 14483464192
  },
  "weight_map": {
  "lm_head.weight": "model-00003-of-00003.safetensors",
@@ -23,24 +23,24 @@
  "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
- "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
  "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
  "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
  "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
  "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
@@ -140,24 +140,24 @@
  "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
  "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
  "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
  "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
  "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
  "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
  "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
  "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
runs/Jan07_20-04-55_dgx-a100-12/events.out.tfevents.1736277019.dgx-a100-12.2303065.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bcc9d740357b0f3347fa734cd2c77371d7be89b3be41f77c359bc5e4bfc0c22
+ size 6155
tokenizer.json CHANGED
The diff for this file is too large to render.
 
tokenizer.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
- size 499723
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+ size 493443
tokenizer_config.json CHANGED
@@ -28,6 +28,7 @@
  "special": true
  }
  },
+ "additional_special_tokens": [],
  "bos_token": "<s>",
  "chat_template": "{{ bos_token + 'System: ' + (messages[0]['content'] | trim + '\n\n' if messages[0]['role'] == 'system' else '') }}{% set messages = messages[1:] if messages[0]['role'] == 'system' else messages %}{% for message in messages %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') if (message['role'] == 'user') != (loop.index0 % 2 == 0) else '' }}{{ '[INST] ' + message['content'] | trim + ' [/INST]' if message['role'] == 'user' else ' ' + message['content'] | trim + eos_token }}{% endfor %}",
  "clean_up_tokenization_spaces": false,
@@ -35,8 +36,8 @@
  "legacy": false,
  "model_max_length": 2048,
  "pad_token": "</s>",
- "padding_side": "right",
  "sp_model_kwargs": {},
+ "spaces_between_special_tokens": false,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
train_results.json CHANGED
@@ -1,9 +1,9 @@
  {
  "epoch": 0.9565217391304348,
- "total_flos": 1083185692672.0,
- "train_loss": 0.9741405790502374,
- "train_runtime": 62.2828,
+ "total_flos": 1081332727808.0,
+ "train_loss": 0.8119967471469532,
+ "train_runtime": 65.397,
  "train_samples": 368,
- "train_samples_per_second": 5.909,
- "train_steps_per_second": 0.177
+ "train_samples_per_second": 5.627,
+ "train_steps_per_second": 0.168
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b251f1c82b0822b8ab316a9cf5e968795149fba290f76e09fba4db75d272f73c
+ oid sha256:78a20881c257c3665ecb1a6b3c1f24256f69f94481025b9f4fca67cd1f94d43d
  size 6968