File size: 5,308 Bytes
c6afd6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
{
  "best_metric": 0.761252446183953,
  "best_model_checkpoint": "tiny-bert-sst2-distilled/run-6/checkpoint-837",
  "epoch": 9.0,
  "eval_steps": 500,
  "global_step": 837,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 0.4286664128303528,
      "learning_rate": 1.945409831472016e-05,
      "loss": 0.4806,
      "step": 93
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.5,
      "eval_f1": 0.0,
      "eval_loss": 0.4703535735607147,
      "eval_precision": 0.0,
      "eval_recall": 0.0,
      "eval_runtime": 28.42,
      "eval_samples_per_second": 35.961,
      "eval_steps_per_second": 1.126,
      "step": 93
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.94158935546875,
      "learning_rate": 1.7022336025380143e-05,
      "loss": 0.4648,
      "step": 186
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5176125244618396,
      "eval_f1": 0.10849909584086799,
      "eval_loss": 0.4538751244544983,
      "eval_precision": 0.7142857142857143,
      "eval_recall": 0.05870841487279843,
      "eval_runtime": 28.3351,
      "eval_samples_per_second": 36.068,
      "eval_steps_per_second": 1.129,
      "step": 186
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.0048439502716064,
      "learning_rate": 1.459057373604012e-05,
      "loss": 0.4406,
      "step": 279
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5929549902152642,
      "eval_f1": 0.3677811550151976,
      "eval_loss": 0.4149659276008606,
      "eval_precision": 0.8231292517006803,
      "eval_recall": 0.23679060665362034,
      "eval_runtime": 28.3925,
      "eval_samples_per_second": 35.995,
      "eval_steps_per_second": 1.127,
      "step": 279
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.1001150608062744,
      "learning_rate": 1.2158811446700102e-05,
      "loss": 0.4126,
      "step": 372
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.7025440313111546,
      "eval_f1": 0.6456876456876457,
      "eval_loss": 0.39196181297302246,
      "eval_precision": 0.7982708933717579,
      "eval_recall": 0.5420743639921722,
      "eval_runtime": 28.4943,
      "eval_samples_per_second": 35.867,
      "eval_steps_per_second": 1.123,
      "step": 372
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.2996047735214233,
      "learning_rate": 9.72704915736008e-06,
      "loss": 0.4021,
      "step": 465
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.735812133072407,
      "eval_f1": 0.7133757961783439,
      "eval_loss": 0.3851335346698761,
      "eval_precision": 0.7795823665893271,
      "eval_recall": 0.6575342465753424,
      "eval_runtime": 28.8344,
      "eval_samples_per_second": 35.444,
      "eval_steps_per_second": 1.11,
      "step": 465
    },
    {
      "epoch": 6.0,
      "grad_norm": 1.8318911790847778,
      "learning_rate": 7.29528686802006e-06,
      "loss": 0.3976,
      "step": 558
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.7524461839530333,
      "eval_f1": 0.7394438722966015,
      "eval_loss": 0.3816056251525879,
      "eval_precision": 0.7804347826086957,
      "eval_recall": 0.7025440313111546,
      "eval_runtime": 28.629,
      "eval_samples_per_second": 35.698,
      "eval_steps_per_second": 1.118,
      "step": 558
    },
    {
      "epoch": 7.0,
      "grad_norm": 1.5164391994476318,
      "learning_rate": 4.86352457868004e-06,
      "loss": 0.3934,
      "step": 651
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.7504892367906066,
      "eval_f1": 0.7357512953367875,
      "eval_loss": 0.37981584668159485,
      "eval_precision": 0.7819383259911894,
      "eval_recall": 0.6947162426614482,
      "eval_runtime": 29.4404,
      "eval_samples_per_second": 34.714,
      "eval_steps_per_second": 1.087,
      "step": 651
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.8273214101791382,
      "learning_rate": 2.43176228934002e-06,
      "loss": 0.3903,
      "step": 744
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.7544031311154599,
      "eval_f1": 0.7462082912032356,
      "eval_loss": 0.3790663480758667,
      "eval_precision": 0.7719665271966527,
      "eval_recall": 0.7221135029354208,
      "eval_runtime": 28.4026,
      "eval_samples_per_second": 35.983,
      "eval_steps_per_second": 1.127,
      "step": 744
    },
    {
      "epoch": 9.0,
      "grad_norm": 1.551561713218689,
      "learning_rate": 0.0,
      "loss": 0.3884,
      "step": 837
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.761252446183953,
      "eval_f1": 0.7555110220440882,
      "eval_loss": 0.3788539469242096,
      "eval_precision": 0.7741273100616016,
      "eval_recall": 0.7377690802348337,
      "eval_runtime": 28.2118,
      "eval_samples_per_second": 36.226,
      "eval_steps_per_second": 1.134,
      "step": 837
    }
  ],
  "logging_steps": 500,
  "max_steps": 837,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 500,
  "total_flos": 2121256775520.0,
  "train_batch_size": 33,
  "trial_name": null,
  "trial_params": {
    "alpha": 0.6122687021783514,
    "learning_rate": 2.188586060406018e-05,
    "num_train_epochs": 9,
    "per_device_train_batch_size": 33,
    "temperature": 14
  }
}