assskelad committed on
Commit
552dcb6
·
verified ·
1 Parent(s): cac2ba7

Model save

Browse files
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: HuggingFaceTB/SmolLM2-360M
3
+ library_name: transformers
4
+ model_name: smollm2-360M-sft_SmallThoughts
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for smollm2-360M-sft_SmallThoughts
13
+
14
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM2-360M](https://huggingface.co/HuggingFaceTB/SmolLM2-360M).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="assskelad/smollm2-360M-sft_SmallThoughts", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.2
38
+ - Transformers: 4.49.0
39
+ - Pytorch: 2.6.0+cu126
40
+ - Datasets: 3.3.2
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 6258600061401600.0,
3
+ "train_loss": 0.5569522526019659,
4
+ "train_runtime": 905.7572,
5
+ "train_samples": 2500,
6
+ "train_samples_per_second": 5.52,
7
+ "train_steps_per_second": 0.344
8
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "HuggingFaceTB/SmolLM2-360M",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 960,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2560,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 15,
20
+ "num_hidden_layers": 32,
21
+ "num_key_value_heads": 5,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.49.0",
30
+ "use_cache": false,
31
+ "vocab_size": 49152
32
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "transformers_version": "4.49.0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ea50847015dbad7e4352cdf83beb1af3f67f0b33d3629bad0ba0b3a21550548
3
+ size 723674912
runs/Mar10_14-43-34_DESKTOP-T04P4ED/events.out.tfevents.1741607024.DESKTOP-T04P4ED.14000.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79b53b014e21a281ce33dd118065a457a17b0d381eb8d9cd4add4ff01e92145
3
+ size 21839
runs/Mar10_17-22-19_DESKTOP-T04P4ED/events.out.tfevents.1741616667.DESKTOP-T04P4ED.16240.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08743dd8fa8cce7d0bcf33c49ef6b8378aebffa192aedd398acf7f57b7e5c8e4
3
+ size 6367
runs/Mar10_17-26-23_DESKTOP-T04P4ED/events.out.tfevents.1741616831.DESKTOP-T04P4ED.15436.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd88bcbf7543100593252c27bd30cc5c466c576145bf2d436366ed66a0e9e601
3
+ size 15914
runs/Mar10_17-48-12_DESKTOP-T04P4ED/events.out.tfevents.1741618129.DESKTOP-T04P4ED.11860.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b821f1bb0f921f9d2487120fd3cf900df1a4ad157ce038461c526baec923fab
3
+ size 15928
special_tokens_map.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": "<|endoftext|>",
36
+ "unk_token": {
37
+ "content": "<|endoftext|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false
42
+ }
43
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
162
+ "clean_up_tokenization_spaces": false,
163
+ "eos_token": "<|endoftext|>",
164
+ "extra_special_tokens": {},
165
+ "model_max_length": 8192,
166
+ "pad_token": "<|endoftext|>",
167
+ "tokenizer_class": "GPT2Tokenizer",
168
+ "unk_token": "<|endoftext|>",
169
+ "vocab_size": 49152
170
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 6258600061401600.0,
3
+ "train_loss": 0.5569522526019659,
4
+ "train_runtime": 905.7572,
5
+ "train_samples": 2500,
6
+ "train_samples_per_second": 5.52,
7
+ "train_steps_per_second": 0.344
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8502547144889832,
3
+ "best_model_checkpoint": "data/smollm2-1.7B-sft\\checkpoint-142",
4
+ "epoch": 1.9984,
5
+ "eval_steps": 500,
6
+ "global_step": 312,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.035398230088495575,
13
+ "grad_norm": 8.5625,
14
+ "learning_rate": 0.0001724137931034483,
15
+ "loss": 2.2924,
16
+ "mean_token_accuracy": 0.5223099514842033,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.07079646017699115,
21
+ "grad_norm": 1.1796875,
22
+ "learning_rate": 0.0003448275862068966,
23
+ "loss": 1.07,
24
+ "mean_token_accuracy": 0.7215210050344467,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.10619469026548672,
29
+ "grad_norm": 0.88671875,
30
+ "learning_rate": 0.0005172413793103448,
31
+ "loss": 0.8398,
32
+ "mean_token_accuracy": 0.7689230531454087,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.1415929203539823,
37
+ "grad_norm": 0.96484375,
38
+ "learning_rate": 0.0006896551724137932,
39
+ "loss": 0.8466,
40
+ "mean_token_accuracy": 0.7690734684467315,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.17699115044247787,
45
+ "grad_norm": 1.0859375,
46
+ "learning_rate": 0.0008620689655172414,
47
+ "loss": 0.8315,
48
+ "mean_token_accuracy": 0.7698834031820297,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.21238938053097345,
53
+ "grad_norm": 0.8984375,
54
+ "learning_rate": 0.000999961452773888,
55
+ "loss": 0.8657,
56
+ "mean_token_accuracy": 0.7694115966558457,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.24778761061946902,
61
+ "grad_norm": 0.89453125,
62
+ "learning_rate": 0.0009986129238305635,
63
+ "loss": 0.9508,
64
+ "mean_token_accuracy": 0.7505242705345154,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.2831858407079646,
69
+ "grad_norm": 0.74609375,
70
+ "learning_rate": 0.0009953429730181654,
71
+ "loss": 0.9422,
72
+ "mean_token_accuracy": 0.7508314579725266,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.3185840707964602,
77
+ "grad_norm": 0.84765625,
78
+ "learning_rate": 0.0009901642012034213,
79
+ "loss": 0.9568,
80
+ "mean_token_accuracy": 0.7477447241544724,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.35398230088495575,
85
+ "grad_norm": 0.75390625,
86
+ "learning_rate": 0.0009830965649597454,
87
+ "loss": 0.9577,
88
+ "mean_token_accuracy": 0.7479827046394348,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.3893805309734513,
93
+ "grad_norm": 0.69921875,
94
+ "learning_rate": 0.0009741672996639047,
95
+ "loss": 0.9318,
96
+ "mean_token_accuracy": 0.7587000578641891,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.4247787610619469,
101
+ "grad_norm": 0.83203125,
102
+ "learning_rate": 0.0009634108145435665,
103
+ "loss": 0.9259,
104
+ "mean_token_accuracy": 0.7593102544546128,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.46017699115044247,
109
+ "grad_norm": 0.7109375,
110
+ "learning_rate": 0.0009508685600801703,
111
+ "loss": 0.8877,
112
+ "mean_token_accuracy": 0.768345057964325,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.49557522123893805,
117
+ "grad_norm": 0.66015625,
118
+ "learning_rate": 0.0009365888682780861,
119
+ "loss": 0.9253,
120
+ "mean_token_accuracy": 0.7625227481126785,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.5309734513274337,
125
+ "grad_norm": 0.61328125,
126
+ "learning_rate": 0.0009206267664155906,
127
+ "loss": 0.912,
128
+ "mean_token_accuracy": 0.7615309178829193,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.5663716814159292,
133
+ "grad_norm": 0.67578125,
134
+ "learning_rate": 0.0009030437649953789,
135
+ "loss": 0.9242,
136
+ "mean_token_accuracy": 0.7563268959522247,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.6017699115044248,
141
+ "grad_norm": 0.6171875,
142
+ "learning_rate": 0.0008839076207117484,
143
+ "loss": 0.8975,
144
+ "mean_token_accuracy": 0.7675409287214279,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.6371681415929203,
149
+ "grad_norm": 0.60546875,
150
+ "learning_rate": 0.0008632920753478719,
151
+ "loss": 0.8901,
152
+ "mean_token_accuracy": 0.7678898781538009,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.672566371681416,
157
+ "grad_norm": 0.59765625,
158
+ "learning_rate": 0.0008412765716093271,
159
+ "loss": 0.8868,
160
+ "mean_token_accuracy": 0.7646951735019684,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.7079646017699115,
165
+ "grad_norm": 0.6015625,
166
+ "learning_rate": 0.0008179459469889268,
167
+ "loss": 0.9314,
168
+ "mean_token_accuracy": 0.7571736365556717,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.7433628318584071,
173
+ "grad_norm": 0.6171875,
174
+ "learning_rate": 0.0007933901068425538,
175
+ "loss": 0.8961,
176
+ "mean_token_accuracy": 0.7628611207008362,
177
+ "step": 105
178
+ },
179
+ {
180
+ "epoch": 0.7787610619469026,
181
+ "grad_norm": 0.5859375,
182
+ "learning_rate": 0.000767703677935813,
183
+ "loss": 0.891,
184
+ "mean_token_accuracy": 0.7617514222860337,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.8141592920353983,
189
+ "grad_norm": 0.69140625,
190
+ "learning_rate": 0.000740985643796569,
191
+ "loss": 0.9057,
192
+ "mean_token_accuracy": 0.7621735870838166,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 0.8495575221238938,
197
+ "grad_norm": 0.61328125,
198
+ "learning_rate": 0.0007133389632785542,
199
+ "loss": 0.8874,
200
+ "mean_token_accuracy": 0.7660593271255494,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.8849557522123894,
205
+ "grad_norm": 0.5703125,
206
+ "learning_rate": 0.0006848701738059226,
207
+ "loss": 0.8441,
208
+ "mean_token_accuracy": 0.7740032315254212,
209
+ "step": 125
210
+ },
211
+ {
212
+ "epoch": 0.9203539823008849,
213
+ "grad_norm": 0.59375,
214
+ "learning_rate": 0.0006556889808276594,
215
+ "loss": 0.8767,
216
+ "mean_token_accuracy": 0.7661084860563279,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.9557522123893806,
221
+ "grad_norm": 0.66796875,
222
+ "learning_rate": 0.0006259078350639009,
223
+ "loss": 0.8962,
224
+ "mean_token_accuracy": 0.7660992830991745,
225
+ "step": 135
226
+ },
227
+ {
228
+ "epoch": 0.9911504424778761,
229
+ "grad_norm": 0.5390625,
230
+ "learning_rate": 0.0005956414991732583,
231
+ "loss": 0.8515,
232
+ "mean_token_accuracy": 0.7711150646209717,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 1.0,
237
+ "eval_loss": 0.8502547144889832,
238
+ "eval_mean_token_accuracy": 0.7765891637120929,
239
+ "eval_runtime": 2.1657,
240
+ "eval_samples_per_second": 54.949,
241
+ "eval_steps_per_second": 13.853,
242
+ "step": 142
243
+ },
244
+ {
245
+ "epoch": 0.928,
246
+ "grad_norm": 1.1484375,
247
+ "learning_rate": 0.000649163469284578,
248
+ "loss": 1.5121,
249
+ "mean_token_accuracy": 0.6798420002063116,
250
+ "step": 145
251
+ },
252
+ {
253
+ "epoch": 0.96,
254
+ "grad_norm": 0.6484375,
255
+ "learning_rate": 0.000622170203068947,
256
+ "loss": 1.1683,
257
+ "mean_token_accuracy": 0.7228868573904037,
258
+ "step": 150
259
+ },
260
+ {
261
+ "epoch": 0.992,
262
+ "grad_norm": 0.6171875,
263
+ "learning_rate": 0.0005947925441958392,
264
+ "loss": 1.1881,
265
+ "mean_token_accuracy": 0.7217638492584229,
266
+ "step": 155
267
+ },
268
+ {
269
+ "epoch": 0.9984,
270
+ "eval_loss": 1.1636234521865845,
271
+ "eval_mean_token_accuracy": 0.7250353273223428,
272
+ "eval_runtime": 70.0038,
273
+ "eval_samples_per_second": 14.285,
274
+ "eval_steps_per_second": 3.571,
275
+ "step": 156
276
+ },
277
+ {
278
+ "epoch": 1.0256,
279
+ "grad_norm": 0.609375,
280
+ "learning_rate": 0.0005671166329088278,
281
+ "loss": 1.3735,
282
+ "mean_token_accuracy": 0.7168766930699348,
283
+ "step": 160
284
+ },
285
+ {
286
+ "epoch": 1.0576,
287
+ "grad_norm": 0.578125,
288
+ "learning_rate": 0.0005392295478639225,
289
+ "loss": 1.0769,
290
+ "mean_token_accuracy": 0.740374532341957,
291
+ "step": 165
292
+ },
293
+ {
294
+ "epoch": 1.0896,
295
+ "grad_norm": 0.5,
296
+ "learning_rate": 0.0005112190321479025,
297
+ "loss": 1.0308,
298
+ "mean_token_accuracy": 0.7460691720247269,
299
+ "step": 170
300
+ },
301
+ {
302
+ "epoch": 1.1216,
303
+ "grad_norm": 0.4609375,
304
+ "learning_rate": 0.0004831732172061032,
305
+ "loss": 1.1521,
306
+ "mean_token_accuracy": 0.7258656650781632,
307
+ "step": 175
308
+ },
309
+ {
310
+ "epoch": 1.1536,
311
+ "grad_norm": 0.466796875,
312
+ "learning_rate": 0.0004551803455482833,
313
+ "loss": 1.0695,
314
+ "mean_token_accuracy": 0.7392740726470948,
315
+ "step": 180
316
+ },
317
+ {
318
+ "epoch": 1.1856,
319
+ "grad_norm": 0.3671875,
320
+ "learning_rate": 0.0004273284931050438,
321
+ "loss": 1.0262,
322
+ "mean_token_accuracy": 0.7486750066280365,
323
+ "step": 185
324
+ },
325
+ {
326
+ "epoch": 1.2176,
327
+ "grad_norm": 0.439453125,
328
+ "learning_rate": 0.00039970529210836363,
329
+ "loss": 1.1084,
330
+ "mean_token_accuracy": 0.7333114802837372,
331
+ "step": 190
332
+ },
333
+ {
334
+ "epoch": 1.2496,
335
+ "grad_norm": 0.431640625,
336
+ "learning_rate": 0.00037239765536817873,
337
+ "loss": 0.9957,
338
+ "mean_token_accuracy": 0.7531520247459411,
339
+ "step": 195
340
+ },
341
+ {
342
+ "epoch": 1.2816,
343
+ "grad_norm": 0.369140625,
344
+ "learning_rate": 0.00034549150281252633,
345
+ "loss": 0.9807,
346
+ "mean_token_accuracy": 0.7547169893980026,
347
+ "step": 200
348
+ },
349
+ {
350
+ "epoch": 1.3136,
351
+ "grad_norm": 0.35546875,
352
+ "learning_rate": 0.000319071491151664,
353
+ "loss": 0.9396,
354
+ "mean_token_accuracy": 0.7626697480678558,
355
+ "step": 205
356
+ },
357
+ {
358
+ "epoch": 1.3456000000000001,
359
+ "grad_norm": 0.40234375,
360
+ "learning_rate": 0.00029322074751673977,
361
+ "loss": 0.9961,
362
+ "mean_token_accuracy": 0.753812238574028,
363
+ "step": 210
364
+ },
365
+ {
366
+ "epoch": 1.3776,
367
+ "grad_norm": 0.380859375,
368
+ "learning_rate": 0.000268020607911083,
369
+ "loss": 0.9953,
370
+ "mean_token_accuracy": 0.7547316879034043,
371
+ "step": 215
372
+ },
373
+ {
374
+ "epoch": 1.4096,
375
+ "grad_norm": 0.3515625,
376
+ "learning_rate": 0.000243550361297047,
377
+ "loss": 0.975,
378
+ "mean_token_accuracy": 0.7559601426124573,
379
+ "step": 220
380
+ },
381
+ {
382
+ "epoch": 1.4416,
383
+ "grad_norm": 0.412109375,
384
+ "learning_rate": 0.00021988700012359863,
385
+ "loss": 1.0118,
386
+ "mean_token_accuracy": 0.7498785346746445,
387
+ "step": 225
388
+ },
389
+ {
390
+ "epoch": 1.4736,
391
+ "grad_norm": 0.35546875,
392
+ "learning_rate": 0.0001971049780795901,
393
+ "loss": 0.9645,
394
+ "mean_token_accuracy": 0.757828313112259,
395
+ "step": 230
396
+ },
397
+ {
398
+ "epoch": 1.5056,
399
+ "grad_norm": 0.380859375,
400
+ "learning_rate": 0.00017527597583490823,
401
+ "loss": 0.9121,
402
+ "mean_token_accuracy": 0.7678615897893906,
403
+ "step": 235
404
+ },
405
+ {
406
+ "epoch": 1.5375999999999999,
407
+ "grad_norm": 0.388671875,
408
+ "learning_rate": 0.00015446867550656773,
409
+ "loss": 1.0359,
410
+ "mean_token_accuracy": 0.7470418214797974,
411
+ "step": 240
412
+ },
413
+ {
414
+ "epoch": 1.5695999999999999,
415
+ "grad_norm": 0.416015625,
416
+ "learning_rate": 0.0001347485445593612,
417
+ "loss": 0.9479,
418
+ "mean_token_accuracy": 0.7594236731529236,
419
+ "step": 245
420
+ },
421
+ {
422
+ "epoch": 1.6016,
423
+ "grad_norm": 0.34765625,
424
+ "learning_rate": 0.00011617762982099444,
425
+ "loss": 0.9621,
426
+ "mean_token_accuracy": 0.7574791193008423,
427
+ "step": 250
428
+ },
429
+ {
430
+ "epoch": 1.6336,
431
+ "grad_norm": 0.4140625,
432
+ "learning_rate": 9.881436225981105e-05,
433
+ "loss": 0.9807,
434
+ "mean_token_accuracy": 0.7525162696838379,
435
+ "step": 255
436
+ },
437
+ {
438
+ "epoch": 1.6656,
439
+ "grad_norm": 0.431640625,
440
+ "learning_rate": 8.271337313934873e-05,
441
+ "loss": 0.9372,
442
+ "mean_token_accuracy": 0.7655486732721328,
443
+ "step": 260
444
+ },
445
+ {
446
+ "epoch": 1.6976,
447
+ "grad_norm": 0.40625,
448
+ "learning_rate": 6.792532212817271e-05,
449
+ "loss": 0.9755,
450
+ "mean_token_accuracy": 0.7566721349954605,
451
+ "step": 265
452
+ },
453
+ {
454
+ "epoch": 1.7296,
455
+ "grad_norm": 0.44921875,
456
+ "learning_rate": 5.449673790581611e-05,
457
+ "loss": 0.9346,
458
+ "mean_token_accuracy": 0.7655843138694763,
459
+ "step": 270
460
+ },
461
+ {
462
+ "epoch": 1.7616,
463
+ "grad_norm": 0.443359375,
464
+ "learning_rate": 4.2469871766340095e-05,
465
+ "loss": 1.0262,
466
+ "mean_token_accuracy": 0.748220956325531,
467
+ "step": 275
468
+ },
469
+ {
470
+ "epoch": 1.7936,
471
+ "grad_norm": 0.416015625,
472
+ "learning_rate": 3.18825646801314e-05,
473
+ "loss": 0.973,
474
+ "mean_token_accuracy": 0.7554513663053513,
475
+ "step": 280
476
+ },
477
+ {
478
+ "epoch": 1.8256000000000001,
479
+ "grad_norm": 0.36328125,
480
+ "learning_rate": 2.276812823220964e-05,
481
+ "loss": 0.9479,
482
+ "mean_token_accuracy": 0.7615785777568818,
483
+ "step": 285
484
+ },
485
+ {
486
+ "epoch": 1.8576000000000001,
487
+ "grad_norm": 0.390625,
488
+ "learning_rate": 1.5155239811656562e-05,
489
+ "loss": 0.9759,
490
+ "mean_token_accuracy": 0.7542947381734848,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 1.8896,
495
+ "grad_norm": 0.41796875,
496
+ "learning_rate": 9.0678523819408e-06,
497
+ "loss": 0.9293,
498
+ "mean_token_accuracy": 0.7637231528759003,
499
+ "step": 295
500
+ },
501
+ {
502
+ "epoch": 1.9216,
503
+ "grad_norm": 0.416015625,
504
+ "learning_rate": 4.52511911603265e-06,
505
+ "loss": 0.9287,
506
+ "mean_token_accuracy": 0.7664623886346817,
507
+ "step": 300
508
+ },
509
+ {
510
+ "epoch": 1.9536,
511
+ "grad_norm": 0.357421875,
512
+ "learning_rate": 1.541333133436018e-06,
513
+ "loss": 0.995,
514
+ "mean_token_accuracy": 0.7533135890960694,
515
+ "step": 305
516
+ },
517
+ {
518
+ "epoch": 1.9856,
519
+ "grad_norm": 0.384765625,
520
+ "learning_rate": 1.2588252874673466e-07,
521
+ "loss": 0.9186,
522
+ "mean_token_accuracy": 0.7667679220438004,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 1.9984,
527
+ "eval_loss": 1.0249439477920532,
528
+ "eval_mean_token_accuracy": 0.7480296109073846,
529
+ "eval_runtime": 74.4351,
530
+ "eval_samples_per_second": 13.435,
531
+ "eval_steps_per_second": 3.359,
532
+ "step": 312
533
+ },
534
+ {
535
+ "epoch": 1.9984,
536
+ "step": 312,
537
+ "total_flos": 6258600061401600.0,
538
+ "train_loss": 0.5569522526019659,
539
+ "train_runtime": 905.7572,
540
+ "train_samples_per_second": 5.52,
541
+ "train_steps_per_second": 0.344
542
+ }
543
+ ],
544
+ "logging_steps": 5,
545
+ "max_steps": 312,
546
+ "num_input_tokens_seen": 0,
547
+ "num_train_epochs": 2,
548
+ "save_steps": 500,
549
+ "stateful_callbacks": {
550
+ "TrainerControl": {
551
+ "args": {
552
+ "should_epoch_stop": false,
553
+ "should_evaluate": false,
554
+ "should_log": false,
555
+ "should_save": true,
556
+ "should_training_stop": false
557
+ },
558
+ "attributes": {}
559
+ }
560
+ },
561
+ "total_flos": 6258600061401600.0,
562
+ "train_batch_size": 4,
563
+ "trial_name": null,
564
+ "trial_params": null
565
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0fe305d962f97164f1015eae1856be72b8587a1a543d77a5451ce4b3139ab55
3
+ size 5624
vocab.json ADDED
The diff for this file is too large to render. See raw diff