diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5079 @@ +{ + "best_metric": 0.8791208791208791, + "best_model_checkpoint": "vit-msn-small-lateral_flow_ivalidation_train_test_6/checkpoint-318", + "epoch": 92.3076923076923, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.15384615384615385, + "grad_norm": 8.29874038696289, + "learning_rate": 2.7777777777777776e-09, + "loss": 0.6813, + "step": 1 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 14.01459789276123, + "learning_rate": 5.555555555555555e-09, + "loss": 0.6845, + "step": 2 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 12.911795616149902, + "learning_rate": 8.333333333333334e-09, + "loss": 0.6836, + "step": 3 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 6.274200439453125, + "learning_rate": 1.111111111111111e-08, + "loss": 0.6521, + "step": 4 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 8.031621932983398, + "learning_rate": 1.3888888888888887e-08, + "loss": 0.6781, + "step": 5 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 7.054372787475586, + "learning_rate": 1.6666666666666667e-08, + "loss": 0.6672, + "step": 6 + }, + { + "epoch": 0.9230769230769231, + "eval_accuracy": 0.42124542124542125, + "eval_loss": 0.6979768872261047, + "eval_runtime": 1.1041, + "eval_samples_per_second": 247.254, + "eval_steps_per_second": 4.528, + "step": 6 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 7.790348529815674, + "learning_rate": 1.9444444444444445e-08, + "loss": 0.6725, + "step": 7 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 13.414072036743164, + "learning_rate": 2.222222222222222e-08, + "loss": 0.6652, + "step": 8 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 8.81810474395752, + "learning_rate": 2.5e-08, + "loss": 0.6698, + "step": 9 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 7.751201152801514, + "learning_rate": 2.7777777777777774e-08, + "loss": 0.6817, + "step": 10 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 8.985770225524902, + "learning_rate": 3.0555555555555556e-08, + "loss": 0.6661, + "step": 11 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 6.888631820678711, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.6712, + "step": 12 + }, + { + "epoch": 2.0, + "grad_norm": 8.5791597366333, + "learning_rate": 3.6111111111111106e-08, + "loss": 0.6617, + "step": 13 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.4249084249084249, + "eval_loss": 0.6965357065200806, + "eval_runtime": 1.1521, + "eval_samples_per_second": 236.95, + "eval_steps_per_second": 4.34, + "step": 13 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 9.721835136413574, + "learning_rate": 3.888888888888889e-08, + "loss": 0.665, + "step": 14 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 10.525633811950684, + "learning_rate": 4.166666666666666e-08, + "loss": 0.6629, + "step": 15 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 5.860161304473877, + "learning_rate": 4.444444444444444e-08, + "loss": 0.686, + "step": 16 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 7.980571269989014, + "learning_rate": 4.722222222222222e-08, + "loss": 0.6638, + "step": 17 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 6.91188383102417, + "learning_rate": 5e-08, + "loss": 0.6586, + "step": 18 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 8.952598571777344, + "learning_rate": 5.2777777777777776e-08, + "loss": 0.6699, + "step": 19 + }, + { + "epoch": 2.9230769230769234, + "eval_accuracy": 0.43956043956043955, + "eval_loss": 0.6944313049316406, + "eval_runtime": 1.1568, + "eval_samples_per_second": 235.989, + "eval_steps_per_second": 4.322, + "step": 19 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 16.572885513305664, + "learning_rate": 5.555555555555555e-08, + "loss": 0.6761, + "step": 20 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 10.379450798034668, + "learning_rate": 5.833333333333333e-08, + "loss": 0.6757, + "step": 21 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 8.093114852905273, + "learning_rate": 6.111111111111111e-08, + "loss": 0.6572, + "step": 22 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 12.009868621826172, + "learning_rate": 6.388888888888888e-08, + "loss": 0.6781, + "step": 23 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 6.616479396820068, + "learning_rate": 6.666666666666667e-08, + "loss": 0.6519, + "step": 24 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 4.752511024475098, + "learning_rate": 6.944444444444444e-08, + "loss": 0.669, + "step": 25 + }, + { + "epoch": 4.0, + "grad_norm": 6.52732515335083, + "learning_rate": 7.222222222222221e-08, + "loss": 0.662, + "step": 26 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.43956043956043955, + "eval_loss": 0.6909541487693787, + "eval_runtime": 1.1432, + "eval_samples_per_second": 238.799, + "eval_steps_per_second": 4.374, + "step": 26 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 10.048100471496582, + "learning_rate": 7.5e-08, + "loss": 0.6616, + "step": 27 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 9.803552627563477, + "learning_rate": 7.777777777777778e-08, + "loss": 0.672, + "step": 28 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 4.989064693450928, + "learning_rate": 8.055555555555555e-08, + "loss": 0.6615, + "step": 29 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 9.584320068359375, + "learning_rate": 8.333333333333333e-08, + "loss": 0.6534, + "step": 30 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 8.697301864624023, + "learning_rate": 8.611111111111111e-08, + "loss": 0.6592, + "step": 31 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 6.305507183074951, + "learning_rate": 8.888888888888888e-08, + "loss": 0.6548, + "step": 32 + }, + { + "epoch": 4.923076923076923, + "eval_accuracy": 0.45787545787545786, + "eval_loss": 0.6873242855072021, + "eval_runtime": 1.0879, + "eval_samples_per_second": 250.944, + "eval_steps_per_second": 4.596, + "step": 32 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 8.161693572998047, + "learning_rate": 9.166666666666665e-08, + "loss": 0.6463, + "step": 33 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 5.462063789367676, + "learning_rate": 9.444444444444444e-08, + "loss": 0.6532, + "step": 34 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 7.397779941558838, + "learning_rate": 9.722222222222222e-08, + "loss": 0.6447, + "step": 35 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 9.939643859863281, + "learning_rate": 1e-07, + "loss": 0.6484, + "step": 36 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 6.816234111785889, + "learning_rate": 1.0277777777777777e-07, + "loss": 0.6558, + "step": 37 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 11.238946914672852, + "learning_rate": 1.0555555555555555e-07, + "loss": 0.6669, + "step": 38 + }, + { + "epoch": 6.0, + "grad_norm": 8.005824089050293, + "learning_rate": 1.0833333333333334e-07, + "loss": 0.6541, + "step": 39 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.4835164835164835, + "eval_loss": 0.6825475692749023, + "eval_runtime": 1.1582, + "eval_samples_per_second": 235.72, + "eval_steps_per_second": 4.317, + "step": 39 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 6.5383524894714355, + "learning_rate": 1.111111111111111e-07, + "loss": 0.6317, + "step": 40 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 4.98549747467041, + "learning_rate": 1.1388888888888888e-07, + "loss": 0.6609, + "step": 41 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 6.8932037353515625, + "learning_rate": 1.1666666666666667e-07, + "loss": 0.6459, + "step": 42 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 6.804798126220703, + "learning_rate": 1.1944444444444445e-07, + "loss": 0.6628, + "step": 43 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 7.5995707511901855, + "learning_rate": 1.2222222222222222e-07, + "loss": 0.6501, + "step": 44 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 10.426745414733887, + "learning_rate": 1.25e-07, + "loss": 0.6222, + "step": 45 + }, + { + "epoch": 6.923076923076923, + "eval_accuracy": 0.5311355311355311, + "eval_loss": 0.6776841282844543, + "eval_runtime": 1.1748, + "eval_samples_per_second": 232.383, + "eval_steps_per_second": 4.256, + "step": 45 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 6.766184329986572, + "learning_rate": 1.2777777777777777e-07, + "loss": 0.6452, + "step": 46 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 10.556381225585938, + "learning_rate": 1.3055555555555556e-07, + "loss": 0.6157, + "step": 47 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 9.157587051391602, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.6313, + "step": 48 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 6.186281204223633, + "learning_rate": 1.3611111111111108e-07, + "loss": 0.6773, + "step": 49 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 5.614987373352051, + "learning_rate": 1.3888888888888888e-07, + "loss": 0.6474, + "step": 50 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 5.66176700592041, + "learning_rate": 1.4166666666666665e-07, + "loss": 0.6463, + "step": 51 + }, + { + "epoch": 8.0, + "grad_norm": 4.733225345611572, + "learning_rate": 1.4444444444444442e-07, + "loss": 0.6555, + "step": 52 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.5421245421245421, + "eval_loss": 0.6718742251396179, + "eval_runtime": 1.1693, + "eval_samples_per_second": 233.475, + "eval_steps_per_second": 4.276, + "step": 52 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 5.067059516906738, + "learning_rate": 1.4722222222222222e-07, + "loss": 0.6388, + "step": 53 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 7.690587520599365, + "learning_rate": 1.5e-07, + "loss": 0.6139, + "step": 54 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 4.471611976623535, + "learning_rate": 1.527777777777778e-07, + "loss": 0.64, + "step": 55 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 6.964099884033203, + "learning_rate": 1.5555555555555556e-07, + "loss": 0.6399, + "step": 56 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 4.763670444488525, + "learning_rate": 1.583333333333333e-07, + "loss": 0.6176, + "step": 57 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 7.8895063400268555, + "learning_rate": 1.611111111111111e-07, + "loss": 0.6226, + "step": 58 + }, + { + "epoch": 8.923076923076923, + "eval_accuracy": 0.5860805860805861, + "eval_loss": 0.6665313839912415, + "eval_runtime": 1.1415, + "eval_samples_per_second": 239.164, + "eval_steps_per_second": 4.38, + "step": 58 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 5.872027397155762, + "learning_rate": 1.6388888888888888e-07, + "loss": 0.6097, + "step": 59 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 4.506241798400879, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.6481, + "step": 60 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 6.888943672180176, + "learning_rate": 1.6944444444444445e-07, + "loss": 0.6135, + "step": 61 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 5.079667568206787, + "learning_rate": 1.7222222222222222e-07, + "loss": 0.6268, + "step": 62 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 5.653709411621094, + "learning_rate": 1.75e-07, + "loss": 0.6259, + "step": 63 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 5.669986724853516, + "learning_rate": 1.7777777777777776e-07, + "loss": 0.6195, + "step": 64 + }, + { + "epoch": 10.0, + "grad_norm": 5.195782661437988, + "learning_rate": 1.8055555555555554e-07, + "loss": 0.5989, + "step": 65 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.6153846153846154, + "eval_loss": 0.6603101491928101, + "eval_runtime": 1.1943, + "eval_samples_per_second": 228.592, + "eval_steps_per_second": 4.187, + "step": 65 + }, + { + "epoch": 10.153846153846153, + "grad_norm": 5.164191722869873, + "learning_rate": 1.833333333333333e-07, + "loss": 0.6329, + "step": 66 + }, + { + "epoch": 10.307692307692308, + "grad_norm": 5.711167812347412, + "learning_rate": 1.861111111111111e-07, + "loss": 0.6198, + "step": 67 + }, + { + "epoch": 10.461538461538462, + "grad_norm": 5.415841102600098, + "learning_rate": 1.8888888888888888e-07, + "loss": 0.6162, + "step": 68 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 5.584977626800537, + "learning_rate": 1.9166666666666668e-07, + "loss": 0.626, + "step": 69 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 4.149146556854248, + "learning_rate": 1.9444444444444445e-07, + "loss": 0.6246, + "step": 70 + }, + { + "epoch": 10.923076923076923, + "grad_norm": 8.436751365661621, + "learning_rate": 1.9722222222222222e-07, + "loss": 0.5754, + "step": 71 + }, + { + "epoch": 10.923076923076923, + "eval_accuracy": 0.6263736263736264, + "eval_loss": 0.6555379629135132, + "eval_runtime": 1.1482, + "eval_samples_per_second": 237.759, + "eval_steps_per_second": 4.355, + "step": 71 + }, + { + "epoch": 11.076923076923077, + "grad_norm": 5.09835147857666, + "learning_rate": 2e-07, + "loss": 0.6153, + "step": 72 + }, + { + "epoch": 11.23076923076923, + "grad_norm": 6.399817943572998, + "learning_rate": 2.0277777777777776e-07, + "loss": 0.6001, + "step": 73 + }, + { + "epoch": 11.384615384615385, + "grad_norm": 4.717789173126221, + "learning_rate": 2.0555555555555553e-07, + "loss": 0.6009, + "step": 74 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 5.399771213531494, + "learning_rate": 2.0833333333333333e-07, + "loss": 0.5998, + "step": 75 + }, + { + "epoch": 11.692307692307692, + "grad_norm": 5.429381370544434, + "learning_rate": 2.111111111111111e-07, + "loss": 0.5892, + "step": 76 + }, + { + "epoch": 11.846153846153847, + "grad_norm": 4.105190277099609, + "learning_rate": 2.1388888888888888e-07, + "loss": 0.6111, + "step": 77 + }, + { + "epoch": 12.0, + "grad_norm": 6.455173969268799, + "learning_rate": 2.1666666666666667e-07, + "loss": 0.6251, + "step": 78 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.6483516483516484, + "eval_loss": 0.6492875218391418, + "eval_runtime": 1.1131, + "eval_samples_per_second": 245.251, + "eval_steps_per_second": 4.492, + "step": 78 + }, + { + "epoch": 12.153846153846153, + "grad_norm": 6.023905277252197, + "learning_rate": 2.1944444444444442e-07, + "loss": 0.6211, + "step": 79 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 4.803109645843506, + "learning_rate": 2.222222222222222e-07, + "loss": 0.5972, + "step": 80 + }, + { + "epoch": 12.461538461538462, + "grad_norm": 4.598735332489014, + "learning_rate": 2.25e-07, + "loss": 0.5978, + "step": 81 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 5.137476444244385, + "learning_rate": 2.2777777777777776e-07, + "loss": 0.5878, + "step": 82 + }, + { + "epoch": 12.76923076923077, + "grad_norm": 5.255553245544434, + "learning_rate": 2.3055555555555556e-07, + "loss": 0.59, + "step": 83 + }, + { + "epoch": 12.923076923076923, + "grad_norm": 4.83677864074707, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.5796, + "step": 84 + }, + { + "epoch": 12.923076923076923, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 0.6446050405502319, + "eval_runtime": 1.1065, + "eval_samples_per_second": 246.726, + "eval_steps_per_second": 4.519, + "step": 84 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 5.2046403884887695, + "learning_rate": 2.361111111111111e-07, + "loss": 0.5808, + "step": 85 + }, + { + "epoch": 13.23076923076923, + "grad_norm": 5.6977057456970215, + "learning_rate": 2.388888888888889e-07, + "loss": 0.5921, + "step": 86 + }, + { + "epoch": 13.384615384615385, + "grad_norm": 4.75449800491333, + "learning_rate": 2.4166666666666665e-07, + "loss": 0.576, + "step": 87 + }, + { + "epoch": 13.538461538461538, + "grad_norm": 4.761056423187256, + "learning_rate": 2.4444444444444445e-07, + "loss": 0.6126, + "step": 88 + }, + { + "epoch": 13.692307692307692, + "grad_norm": 4.913057327270508, + "learning_rate": 2.4722222222222224e-07, + "loss": 0.5814, + "step": 89 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 7.290613651275635, + "learning_rate": 2.5e-07, + "loss": 0.5803, + "step": 90 + }, + { + "epoch": 14.0, + "grad_norm": 6.033799171447754, + "learning_rate": 2.5277777777777773e-07, + "loss": 0.5763, + "step": 91 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.6666666666666666, + "eval_loss": 0.6390318274497986, + "eval_runtime": 1.1176, + "eval_samples_per_second": 244.273, + "eval_steps_per_second": 4.474, + "step": 91 + }, + { + "epoch": 14.153846153846153, + "grad_norm": 5.3400678634643555, + "learning_rate": 2.5555555555555553e-07, + "loss": 0.5817, + "step": 92 + }, + { + "epoch": 14.307692307692308, + "grad_norm": 3.9350779056549072, + "learning_rate": 2.5833333333333333e-07, + "loss": 0.588, + "step": 93 + }, + { + "epoch": 14.461538461538462, + "grad_norm": 4.339548110961914, + "learning_rate": 2.6111111111111113e-07, + "loss": 0.5964, + "step": 94 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 5.889684677124023, + "learning_rate": 2.638888888888889e-07, + "loss": 0.5636, + "step": 95 + }, + { + "epoch": 14.76923076923077, + "grad_norm": 4.385285377502441, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.5898, + "step": 96 + }, + { + "epoch": 14.923076923076923, + "grad_norm": 7.42651891708374, + "learning_rate": 2.694444444444444e-07, + "loss": 0.5952, + "step": 97 + }, + { + "epoch": 14.923076923076923, + "eval_accuracy": 0.684981684981685, + "eval_loss": 0.6333425045013428, + "eval_runtime": 1.0931, + "eval_samples_per_second": 249.747, + "eval_steps_per_second": 4.574, + "step": 97 + }, + { + "epoch": 15.076923076923077, + "grad_norm": 7.099740028381348, + "learning_rate": 2.7222222222222216e-07, + "loss": 0.5412, + "step": 98 + }, + { + "epoch": 15.23076923076923, + "grad_norm": 5.428436279296875, + "learning_rate": 2.75e-07, + "loss": 0.5645, + "step": 99 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 5.187060832977295, + "learning_rate": 2.7777777777777776e-07, + "loss": 0.5795, + "step": 100 + }, + { + "epoch": 15.538461538461538, + "grad_norm": 4.902520656585693, + "learning_rate": 2.8055555555555556e-07, + "loss": 0.5459, + "step": 101 + }, + { + "epoch": 15.692307692307692, + "grad_norm": 4.782113075256348, + "learning_rate": 2.833333333333333e-07, + "loss": 0.577, + "step": 102 + }, + { + "epoch": 15.846153846153847, + "grad_norm": 6.622628688812256, + "learning_rate": 2.861111111111111e-07, + "loss": 0.5583, + "step": 103 + }, + { + "epoch": 16.0, + "grad_norm": 4.653256416320801, + "learning_rate": 2.8888888888888885e-07, + "loss": 0.5675, + "step": 104 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.7032967032967034, + "eval_loss": 0.6269450783729553, + "eval_runtime": 1.2331, + "eval_samples_per_second": 221.389, + "eval_steps_per_second": 4.055, + "step": 104 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 5.335786819458008, + "learning_rate": 2.916666666666667e-07, + "loss": 0.5492, + "step": 105 + }, + { + "epoch": 16.307692307692307, + "grad_norm": 6.031639575958252, + "learning_rate": 2.9444444444444444e-07, + "loss": 0.5475, + "step": 106 + }, + { + "epoch": 16.46153846153846, + "grad_norm": 8.427881240844727, + "learning_rate": 2.972222222222222e-07, + "loss": 0.5995, + "step": 107 + }, + { + "epoch": 16.615384615384617, + "grad_norm": 4.41181755065918, + "learning_rate": 3e-07, + "loss": 0.5458, + "step": 108 + }, + { + "epoch": 16.76923076923077, + "grad_norm": 5.0706257820129395, + "learning_rate": 3.0277777777777773e-07, + "loss": 0.5739, + "step": 109 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 5.834512233734131, + "learning_rate": 3.055555555555556e-07, + "loss": 0.5453, + "step": 110 + }, + { + "epoch": 16.923076923076923, + "eval_accuracy": 0.7106227106227107, + "eval_loss": 0.6210848689079285, + "eval_runtime": 1.1361, + "eval_samples_per_second": 240.302, + "eval_steps_per_second": 4.401, + "step": 110 + }, + { + "epoch": 17.076923076923077, + "grad_norm": 4.99503231048584, + "learning_rate": 3.0833333333333333e-07, + "loss": 0.5517, + "step": 111 + }, + { + "epoch": 17.23076923076923, + "grad_norm": 4.0909833908081055, + "learning_rate": 3.111111111111111e-07, + "loss": 0.5664, + "step": 112 + }, + { + "epoch": 17.384615384615383, + "grad_norm": 4.5490851402282715, + "learning_rate": 3.1388888888888887e-07, + "loss": 0.541, + "step": 113 + }, + { + "epoch": 17.53846153846154, + "grad_norm": 4.016046524047852, + "learning_rate": 3.166666666666666e-07, + "loss": 0.5788, + "step": 114 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 5.587233066558838, + "learning_rate": 3.194444444444444e-07, + "loss": 0.5155, + "step": 115 + }, + { + "epoch": 17.846153846153847, + "grad_norm": 5.020046234130859, + "learning_rate": 3.222222222222222e-07, + "loss": 0.5386, + "step": 116 + }, + { + "epoch": 18.0, + "grad_norm": 6.417337417602539, + "learning_rate": 3.25e-07, + "loss": 0.5199, + "step": 117 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.7142857142857143, + "eval_loss": 0.6150110363960266, + "eval_runtime": 1.1307, + "eval_samples_per_second": 241.446, + "eval_steps_per_second": 4.422, + "step": 117 + }, + { + "epoch": 18.153846153846153, + "grad_norm": 5.794677257537842, + "learning_rate": 3.2777777777777776e-07, + "loss": 0.5343, + "step": 118 + }, + { + "epoch": 18.307692307692307, + "grad_norm": 4.628763198852539, + "learning_rate": 3.3055555555555556e-07, + "loss": 0.5275, + "step": 119 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 7.958808422088623, + "learning_rate": 3.333333333333333e-07, + "loss": 0.5252, + "step": 120 + }, + { + "epoch": 18.615384615384617, + "grad_norm": 4.4327826499938965, + "learning_rate": 3.361111111111111e-07, + "loss": 0.5233, + "step": 121 + }, + { + "epoch": 18.76923076923077, + "grad_norm": 5.422006130218506, + "learning_rate": 3.388888888888889e-07, + "loss": 0.5646, + "step": 122 + }, + { + "epoch": 18.923076923076923, + "grad_norm": 4.5244975090026855, + "learning_rate": 3.4166666666666664e-07, + "loss": 0.541, + "step": 123 + }, + { + "epoch": 18.923076923076923, + "eval_accuracy": 0.7216117216117216, + "eval_loss": 0.6090343594551086, + "eval_runtime": 1.2237, + "eval_samples_per_second": 223.1, + "eval_steps_per_second": 4.086, + "step": 123 + }, + { + "epoch": 19.076923076923077, + "grad_norm": 6.068373680114746, + "learning_rate": 3.4444444444444444e-07, + "loss": 0.5179, + "step": 124 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 4.209527492523193, + "learning_rate": 3.472222222222222e-07, + "loss": 0.5533, + "step": 125 + }, + { + "epoch": 19.384615384615383, + "grad_norm": 5.68998384475708, + "learning_rate": 3.5e-07, + "loss": 0.5219, + "step": 126 + }, + { + "epoch": 19.53846153846154, + "grad_norm": 5.829248428344727, + "learning_rate": 3.527777777777778e-07, + "loss": 0.503, + "step": 127 + }, + { + "epoch": 19.692307692307693, + "grad_norm": 4.40165376663208, + "learning_rate": 3.5555555555555553e-07, + "loss": 0.5483, + "step": 128 + }, + { + "epoch": 19.846153846153847, + "grad_norm": 4.628623962402344, + "learning_rate": 3.583333333333333e-07, + "loss": 0.5249, + "step": 129 + }, + { + "epoch": 20.0, + "grad_norm": 4.771780014038086, + "learning_rate": 3.6111111111111107e-07, + "loss": 0.5273, + "step": 130 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.7289377289377289, + "eval_loss": 0.6007034182548523, + "eval_runtime": 1.1281, + "eval_samples_per_second": 241.989, + "eval_steps_per_second": 4.432, + "step": 130 + }, + { + "epoch": 20.153846153846153, + "grad_norm": 4.505945682525635, + "learning_rate": 3.6388888888888887e-07, + "loss": 0.515, + "step": 131 + }, + { + "epoch": 20.307692307692307, + "grad_norm": 7.482431411743164, + "learning_rate": 3.666666666666666e-07, + "loss": 0.4948, + "step": 132 + }, + { + "epoch": 20.46153846153846, + "grad_norm": 4.583889007568359, + "learning_rate": 3.6944444444444447e-07, + "loss": 0.5215, + "step": 133 + }, + { + "epoch": 20.615384615384617, + "grad_norm": 5.050055980682373, + "learning_rate": 3.722222222222222e-07, + "loss": 0.502, + "step": 134 + }, + { + "epoch": 20.76923076923077, + "grad_norm": 5.0981245040893555, + "learning_rate": 3.75e-07, + "loss": 0.5343, + "step": 135 + }, + { + "epoch": 20.923076923076923, + "grad_norm": 7.6524529457092285, + "learning_rate": 3.7777777777777775e-07, + "loss": 0.495, + "step": 136 + }, + { + "epoch": 20.923076923076923, + "eval_accuracy": 0.7289377289377289, + "eval_loss": 0.5934087634086609, + "eval_runtime": 1.1036, + "eval_samples_per_second": 247.368, + "eval_steps_per_second": 4.531, + "step": 136 + }, + { + "epoch": 21.076923076923077, + "grad_norm": 5.747613906860352, + "learning_rate": 3.805555555555555e-07, + "loss": 0.4929, + "step": 137 + }, + { + "epoch": 21.23076923076923, + "grad_norm": 8.28075122833252, + "learning_rate": 3.8333333333333335e-07, + "loss": 0.5302, + "step": 138 + }, + { + "epoch": 21.384615384615383, + "grad_norm": 5.376802921295166, + "learning_rate": 3.861111111111111e-07, + "loss": 0.547, + "step": 139 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 4.9347968101501465, + "learning_rate": 3.888888888888889e-07, + "loss": 0.4807, + "step": 140 + }, + { + "epoch": 21.692307692307693, + "grad_norm": 6.574237823486328, + "learning_rate": 3.9166666666666664e-07, + "loss": 0.4839, + "step": 141 + }, + { + "epoch": 21.846153846153847, + "grad_norm": 4.990074157714844, + "learning_rate": 3.9444444444444444e-07, + "loss": 0.4937, + "step": 142 + }, + { + "epoch": 22.0, + "grad_norm": 6.261688232421875, + "learning_rate": 3.972222222222222e-07, + "loss": 0.4855, + "step": 143 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.7472527472527473, + "eval_loss": 0.5855301022529602, + "eval_runtime": 1.1327, + "eval_samples_per_second": 241.009, + "eval_steps_per_second": 4.414, + "step": 143 + }, + { + "epoch": 22.153846153846153, + "grad_norm": 5.447592258453369, + "learning_rate": 4e-07, + "loss": 0.4225, + "step": 144 + }, + { + "epoch": 22.307692307692307, + "grad_norm": 4.675624847412109, + "learning_rate": 4.027777777777778e-07, + "loss": 0.4982, + "step": 145 + }, + { + "epoch": 22.46153846153846, + "grad_norm": 4.06153678894043, + "learning_rate": 4.055555555555555e-07, + "loss": 0.4751, + "step": 146 + }, + { + "epoch": 22.615384615384617, + "grad_norm": 7.109158039093018, + "learning_rate": 4.083333333333333e-07, + "loss": 0.4232, + "step": 147 + }, + { + "epoch": 22.76923076923077, + "grad_norm": 7.506172180175781, + "learning_rate": 4.1111111111111107e-07, + "loss": 0.4935, + "step": 148 + }, + { + "epoch": 22.923076923076923, + "grad_norm": 4.346311092376709, + "learning_rate": 4.1388888888888887e-07, + "loss": 0.4763, + "step": 149 + }, + { + "epoch": 22.923076923076923, + "eval_accuracy": 0.7362637362637363, + "eval_loss": 0.5787296295166016, + "eval_runtime": 1.163, + "eval_samples_per_second": 234.74, + "eval_steps_per_second": 4.299, + "step": 149 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 7.535876750946045, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.4539, + "step": 150 + }, + { + "epoch": 23.23076923076923, + "grad_norm": 4.603224277496338, + "learning_rate": 4.194444444444444e-07, + "loss": 0.491, + "step": 151 + }, + { + "epoch": 23.384615384615383, + "grad_norm": 4.099734783172607, + "learning_rate": 4.222222222222222e-07, + "loss": 0.4877, + "step": 152 + }, + { + "epoch": 23.53846153846154, + "grad_norm": 6.270420551300049, + "learning_rate": 4.2499999999999995e-07, + "loss": 0.4791, + "step": 153 + }, + { + "epoch": 23.692307692307693, + "grad_norm": 5.5756330490112305, + "learning_rate": 4.2777777777777775e-07, + "loss": 0.4192, + "step": 154 + }, + { + "epoch": 23.846153846153847, + "grad_norm": 4.492015838623047, + "learning_rate": 4.3055555555555555e-07, + "loss": 0.429, + "step": 155 + }, + { + "epoch": 24.0, + "grad_norm": 5.119944095611572, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.4287, + "step": 156 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.7509157509157509, + "eval_loss": 0.5693350434303284, + "eval_runtime": 1.2614, + "eval_samples_per_second": 216.434, + "eval_steps_per_second": 3.964, + "step": 156 + }, + { + "epoch": 24.153846153846153, + "grad_norm": 6.68319034576416, + "learning_rate": 4.361111111111111e-07, + "loss": 0.4549, + "step": 157 + }, + { + "epoch": 24.307692307692307, + "grad_norm": 6.93166446685791, + "learning_rate": 4.3888888888888884e-07, + "loss": 0.4878, + "step": 158 + }, + { + "epoch": 24.46153846153846, + "grad_norm": 4.761252403259277, + "learning_rate": 4.4166666666666664e-07, + "loss": 0.463, + "step": 159 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 5.771098613739014, + "learning_rate": 4.444444444444444e-07, + "loss": 0.4418, + "step": 160 + }, + { + "epoch": 24.76923076923077, + "grad_norm": 5.692720413208008, + "learning_rate": 4.4722222222222223e-07, + "loss": 0.4211, + "step": 161 + }, + { + "epoch": 24.923076923076923, + "grad_norm": 10.276296615600586, + "learning_rate": 4.5e-07, + "loss": 0.445, + "step": 162 + }, + { + "epoch": 24.923076923076923, + "eval_accuracy": 0.7692307692307693, + "eval_loss": 0.5618556141853333, + "eval_runtime": 1.2168, + "eval_samples_per_second": 224.355, + "eval_steps_per_second": 4.109, + "step": 162 + }, + { + "epoch": 25.076923076923077, + "grad_norm": 6.790480613708496, + "learning_rate": 4.527777777777778e-07, + "loss": 0.4313, + "step": 163 + }, + { + "epoch": 25.23076923076923, + "grad_norm": 4.737351417541504, + "learning_rate": 4.555555555555555e-07, + "loss": 0.4303, + "step": 164 + }, + { + "epoch": 25.384615384615383, + "grad_norm": 4.519160747528076, + "learning_rate": 4.5833333333333327e-07, + "loss": 0.455, + "step": 165 + }, + { + "epoch": 25.53846153846154, + "grad_norm": 5.933927536010742, + "learning_rate": 4.611111111111111e-07, + "loss": 0.4461, + "step": 166 + }, + { + "epoch": 25.692307692307693, + "grad_norm": 7.9362664222717285, + "learning_rate": 4.6388888888888886e-07, + "loss": 0.3951, + "step": 167 + }, + { + "epoch": 25.846153846153847, + "grad_norm": 9.995780944824219, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.4142, + "step": 168 + }, + { + "epoch": 26.0, + "grad_norm": 6.652438163757324, + "learning_rate": 4.694444444444444e-07, + "loss": 0.4343, + "step": 169 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.7802197802197802, + "eval_loss": 0.5539770126342773, + "eval_runtime": 1.1366, + "eval_samples_per_second": 240.189, + "eval_steps_per_second": 4.399, + "step": 169 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 4.573406219482422, + "learning_rate": 4.722222222222222e-07, + "loss": 0.4279, + "step": 170 + }, + { + "epoch": 26.307692307692307, + "grad_norm": 6.939181327819824, + "learning_rate": 4.7499999999999995e-07, + "loss": 0.443, + "step": 171 + }, + { + "epoch": 26.46153846153846, + "grad_norm": 10.387896537780762, + "learning_rate": 4.777777777777778e-07, + "loss": 0.4462, + "step": 172 + }, + { + "epoch": 26.615384615384617, + "grad_norm": 6.387257099151611, + "learning_rate": 4.805555555555555e-07, + "loss": 0.4221, + "step": 173 + }, + { + "epoch": 26.76923076923077, + "grad_norm": 5.768893241882324, + "learning_rate": 4.833333333333333e-07, + "loss": 0.4048, + "step": 174 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 6.23684024810791, + "learning_rate": 4.861111111111111e-07, + "loss": 0.3748, + "step": 175 + }, + { + "epoch": 26.923076923076923, + "eval_accuracy": 0.7875457875457875, + "eval_loss": 0.5466815829277039, + "eval_runtime": 1.1181, + "eval_samples_per_second": 244.171, + "eval_steps_per_second": 4.472, + "step": 175 + }, + { + "epoch": 27.076923076923077, + "grad_norm": 6.4479146003723145, + "learning_rate": 4.888888888888889e-07, + "loss": 0.3923, + "step": 176 + }, + { + "epoch": 27.23076923076923, + "grad_norm": 8.554872512817383, + "learning_rate": 4.916666666666666e-07, + "loss": 0.4114, + "step": 177 + }, + { + "epoch": 27.384615384615383, + "grad_norm": 6.017930030822754, + "learning_rate": 4.944444444444445e-07, + "loss": 0.3798, + "step": 178 + }, + { + "epoch": 27.53846153846154, + "grad_norm": 8.284281730651855, + "learning_rate": 4.972222222222222e-07, + "loss": 0.4146, + "step": 179 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 9.119588851928711, + "learning_rate": 5e-07, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 27.846153846153847, + "grad_norm": 9.632242202758789, + "learning_rate": 4.988095238095238e-07, + "loss": 0.4031, + "step": 181 + }, + { + "epoch": 28.0, + "grad_norm": 16.935251235961914, + "learning_rate": 4.976190476190476e-07, + "loss": 0.4041, + "step": 182 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.8021978021978022, + "eval_loss": 0.5421282052993774, + "eval_runtime": 1.1482, + "eval_samples_per_second": 237.764, + "eval_steps_per_second": 4.355, + "step": 182 + }, + { + "epoch": 28.153846153846153, + "grad_norm": 5.241950511932373, + "learning_rate": 4.964285714285715e-07, + "loss": 0.4528, + "step": 183 + }, + { + "epoch": 28.307692307692307, + "grad_norm": 11.666001319885254, + "learning_rate": 4.952380952380952e-07, + "loss": 0.3889, + "step": 184 + }, + { + "epoch": 28.46153846153846, + "grad_norm": 9.493650436401367, + "learning_rate": 4.94047619047619e-07, + "loss": 0.3956, + "step": 185 + }, + { + "epoch": 28.615384615384617, + "grad_norm": 11.480545043945312, + "learning_rate": 4.928571428571429e-07, + "loss": 0.3756, + "step": 186 + }, + { + "epoch": 28.76923076923077, + "grad_norm": 18.468042373657227, + "learning_rate": 4.916666666666666e-07, + "loss": 0.401, + "step": 187 + }, + { + "epoch": 28.923076923076923, + "grad_norm": 8.127854347229004, + "learning_rate": 4.904761904761904e-07, + "loss": 0.3543, + "step": 188 + }, + { + "epoch": 28.923076923076923, + "eval_accuracy": 0.8205128205128205, + "eval_loss": 0.529083788394928, + "eval_runtime": 1.1719, + "eval_samples_per_second": 232.959, + "eval_steps_per_second": 4.267, + "step": 188 + }, + { + "epoch": 29.076923076923077, + "grad_norm": 12.729475975036621, + "learning_rate": 4.892857142857142e-07, + "loss": 0.4255, + "step": 189 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 6.080268383026123, + "learning_rate": 4.880952380952381e-07, + "loss": 0.4233, + "step": 190 + }, + { + "epoch": 29.384615384615383, + "grad_norm": 9.651863098144531, + "learning_rate": 4.869047619047619e-07, + "loss": 0.3753, + "step": 191 + }, + { + "epoch": 29.53846153846154, + "grad_norm": 12.342060089111328, + "learning_rate": 4.857142857142857e-07, + "loss": 0.392, + "step": 192 + }, + { + "epoch": 29.692307692307693, + "grad_norm": 5.0623779296875, + "learning_rate": 4.845238095238095e-07, + "loss": 0.3942, + "step": 193 + }, + { + "epoch": 29.846153846153847, + "grad_norm": 18.967161178588867, + "learning_rate": 4.833333333333333e-07, + "loss": 0.3374, + "step": 194 + }, + { + "epoch": 30.0, + "grad_norm": 13.24194622039795, + "learning_rate": 4.821428571428571e-07, + "loss": 0.3972, + "step": 195 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.8278388278388278, + "eval_loss": 0.5134266018867493, + "eval_runtime": 1.2397, + "eval_samples_per_second": 220.215, + "eval_steps_per_second": 4.033, + "step": 195 + }, + { + "epoch": 30.153846153846153, + "grad_norm": 12.518712997436523, + "learning_rate": 4.80952380952381e-07, + "loss": 0.4197, + "step": 196 + }, + { + "epoch": 30.307692307692307, + "grad_norm": 15.362881660461426, + "learning_rate": 4.797619047619048e-07, + "loss": 0.4056, + "step": 197 + }, + { + "epoch": 30.46153846153846, + "grad_norm": 17.072725296020508, + "learning_rate": 4.785714285714286e-07, + "loss": 0.3856, + "step": 198 + }, + { + "epoch": 30.615384615384617, + "grad_norm": 6.637291431427002, + "learning_rate": 4.773809523809523e-07, + "loss": 0.3627, + "step": 199 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 8.751256942749023, + "learning_rate": 4.761904761904761e-07, + "loss": 0.3637, + "step": 200 + }, + { + "epoch": 30.923076923076923, + "grad_norm": 5.588964462280273, + "learning_rate": 4.7499999999999995e-07, + "loss": 0.3716, + "step": 201 + }, + { + "epoch": 30.923076923076923, + "eval_accuracy": 0.8241758241758241, + "eval_loss": 0.5149514079093933, + "eval_runtime": 1.1326, + "eval_samples_per_second": 241.045, + "eval_steps_per_second": 4.415, + "step": 201 + }, + { + "epoch": 31.076923076923077, + "grad_norm": 8.8916654586792, + "learning_rate": 4.738095238095238e-07, + "loss": 0.362, + "step": 202 + }, + { + "epoch": 31.23076923076923, + "grad_norm": 7.172123908996582, + "learning_rate": 4.7261904761904756e-07, + "loss": 0.3541, + "step": 203 + }, + { + "epoch": 31.384615384615383, + "grad_norm": 9.399160385131836, + "learning_rate": 4.714285714285714e-07, + "loss": 0.3629, + "step": 204 + }, + { + "epoch": 31.53846153846154, + "grad_norm": 12.05125904083252, + "learning_rate": 4.702380952380952e-07, + "loss": 0.3414, + "step": 205 + }, + { + "epoch": 31.692307692307693, + "grad_norm": 6.808493137359619, + "learning_rate": 4.69047619047619e-07, + "loss": 0.372, + "step": 206 + }, + { + "epoch": 31.846153846153847, + "grad_norm": 12.759626388549805, + "learning_rate": 4.6785714285714283e-07, + "loss": 0.3574, + "step": 207 + }, + { + "epoch": 32.0, + "grad_norm": 9.680743217468262, + "learning_rate": 4.6666666666666666e-07, + "loss": 0.3871, + "step": 208 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.8315018315018315, + "eval_loss": 0.5100430846214294, + "eval_runtime": 1.1335, + "eval_samples_per_second": 240.838, + "eval_steps_per_second": 4.411, + "step": 208 + }, + { + "epoch": 32.15384615384615, + "grad_norm": 13.226339340209961, + "learning_rate": 4.6547619047619044e-07, + "loss": 0.3623, + "step": 209 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 9.359790802001953, + "learning_rate": 4.6428571428571427e-07, + "loss": 0.4045, + "step": 210 + }, + { + "epoch": 32.46153846153846, + "grad_norm": 4.744467258453369, + "learning_rate": 4.630952380952381e-07, + "loss": 0.3852, + "step": 211 + }, + { + "epoch": 32.61538461538461, + "grad_norm": 9.221460342407227, + "learning_rate": 4.6190476190476193e-07, + "loss": 0.3267, + "step": 212 + }, + { + "epoch": 32.76923076923077, + "grad_norm": 5.6791510581970215, + "learning_rate": 4.6071428571428566e-07, + "loss": 0.3969, + "step": 213 + }, + { + "epoch": 32.92307692307692, + "grad_norm": 13.916045188903809, + "learning_rate": 4.595238095238095e-07, + "loss": 0.3729, + "step": 214 + }, + { + "epoch": 32.92307692307692, + "eval_accuracy": 0.8351648351648352, + "eval_loss": 0.4985570013523102, + "eval_runtime": 1.1523, + "eval_samples_per_second": 236.921, + "eval_steps_per_second": 4.339, + "step": 214 + }, + { + "epoch": 33.07692307692308, + "grad_norm": 20.20587158203125, + "learning_rate": 4.5833333333333327e-07, + "loss": 0.3999, + "step": 215 + }, + { + "epoch": 33.23076923076923, + "grad_norm": 5.628478050231934, + "learning_rate": 4.571428571428571e-07, + "loss": 0.333, + "step": 216 + }, + { + "epoch": 33.38461538461539, + "grad_norm": 9.061274528503418, + "learning_rate": 4.5595238095238093e-07, + "loss": 0.3807, + "step": 217 + }, + { + "epoch": 33.53846153846154, + "grad_norm": 7.302306652069092, + "learning_rate": 4.5476190476190476e-07, + "loss": 0.3309, + "step": 218 + }, + { + "epoch": 33.69230769230769, + "grad_norm": 17.617534637451172, + "learning_rate": 4.5357142857142854e-07, + "loss": 0.319, + "step": 219 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 9.831780433654785, + "learning_rate": 4.5238095238095237e-07, + "loss": 0.3849, + "step": 220 + }, + { + "epoch": 34.0, + "grad_norm": 10.667181968688965, + "learning_rate": 4.511904761904762e-07, + "loss": 0.3286, + "step": 221 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.8461538461538461, + "eval_loss": 0.4945792853832245, + "eval_runtime": 1.1795, + "eval_samples_per_second": 231.451, + "eval_steps_per_second": 4.239, + "step": 221 + }, + { + "epoch": 34.15384615384615, + "grad_norm": 6.82171106338501, + "learning_rate": 4.5e-07, + "loss": 0.3129, + "step": 222 + }, + { + "epoch": 34.30769230769231, + "grad_norm": 12.462568283081055, + "learning_rate": 4.488095238095238e-07, + "loss": 0.2947, + "step": 223 + }, + { + "epoch": 34.46153846153846, + "grad_norm": 14.821395874023438, + "learning_rate": 4.4761904761904764e-07, + "loss": 0.381, + "step": 224 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 11.688921928405762, + "learning_rate": 4.464285714285714e-07, + "loss": 0.392, + "step": 225 + }, + { + "epoch": 34.76923076923077, + "grad_norm": 16.245473861694336, + "learning_rate": 4.452380952380952e-07, + "loss": 0.3356, + "step": 226 + }, + { + "epoch": 34.92307692307692, + "grad_norm": 8.85981559753418, + "learning_rate": 4.4404761904761903e-07, + "loss": 0.4261, + "step": 227 + }, + { + "epoch": 34.92307692307692, + "eval_accuracy": 0.8388278388278388, + "eval_loss": 0.49569690227508545, + "eval_runtime": 1.1615, + "eval_samples_per_second": 235.046, + "eval_steps_per_second": 4.305, + "step": 227 + }, + { + "epoch": 35.07692307692308, + "grad_norm": 14.444450378417969, + "learning_rate": 4.428571428571428e-07, + "loss": 0.3065, + "step": 228 + }, + { + "epoch": 35.23076923076923, + "grad_norm": 18.541534423828125, + "learning_rate": 4.4166666666666664e-07, + "loss": 0.319, + "step": 229 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 14.79796314239502, + "learning_rate": 4.4047619047619047e-07, + "loss": 0.3511, + "step": 230 + }, + { + "epoch": 35.53846153846154, + "grad_norm": 14.025275230407715, + "learning_rate": 4.3928571428571425e-07, + "loss": 0.351, + "step": 231 + }, + { + "epoch": 35.69230769230769, + "grad_norm": 9.698802947998047, + "learning_rate": 4.380952380952381e-07, + "loss": 0.3369, + "step": 232 + }, + { + "epoch": 35.84615384615385, + "grad_norm": 10.56847858428955, + "learning_rate": 4.369047619047619e-07, + "loss": 0.2976, + "step": 233 + }, + { + "epoch": 36.0, + "grad_norm": 15.778355598449707, + "learning_rate": 4.357142857142857e-07, + "loss": 0.4014, + "step": 234 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.8534798534798534, + "eval_loss": 0.48495998978614807, + "eval_runtime": 1.1929, + "eval_samples_per_second": 228.863, + "eval_steps_per_second": 4.192, + "step": 234 + }, + { + "epoch": 36.15384615384615, + "grad_norm": 16.71240234375, + "learning_rate": 4.345238095238095e-07, + "loss": 0.3749, + "step": 235 + }, + { + "epoch": 36.30769230769231, + "grad_norm": 13.584702491760254, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.3812, + "step": 236 + }, + { + "epoch": 36.46153846153846, + "grad_norm": 8.108072280883789, + "learning_rate": 4.3214285714285713e-07, + "loss": 0.3024, + "step": 237 + }, + { + "epoch": 36.61538461538461, + "grad_norm": 9.233282089233398, + "learning_rate": 4.3095238095238096e-07, + "loss": 0.3413, + "step": 238 + }, + { + "epoch": 36.76923076923077, + "grad_norm": 13.080716133117676, + "learning_rate": 4.297619047619048e-07, + "loss": 0.2792, + "step": 239 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 10.88381576538086, + "learning_rate": 4.285714285714285e-07, + "loss": 0.3514, + "step": 240 + }, + { + "epoch": 36.92307692307692, + "eval_accuracy": 0.8534798534798534, + "eval_loss": 0.4806550443172455, + "eval_runtime": 1.1462, + "eval_samples_per_second": 238.179, + "eval_steps_per_second": 4.362, + "step": 240 + }, + { + "epoch": 37.07692307692308, + "grad_norm": 13.345056533813477, + "learning_rate": 4.2738095238095235e-07, + "loss": 0.318, + "step": 241 + }, + { + "epoch": 37.23076923076923, + "grad_norm": 9.593791961669922, + "learning_rate": 4.261904761904762e-07, + "loss": 0.3487, + "step": 242 + }, + { + "epoch": 37.38461538461539, + "grad_norm": 9.83149242401123, + "learning_rate": 4.2499999999999995e-07, + "loss": 0.3283, + "step": 243 + }, + { + "epoch": 37.53846153846154, + "grad_norm": 11.976517677307129, + "learning_rate": 4.238095238095238e-07, + "loss": 0.407, + "step": 244 + }, + { + "epoch": 37.69230769230769, + "grad_norm": 7.24540901184082, + "learning_rate": 4.226190476190476e-07, + "loss": 0.3899, + "step": 245 + }, + { + "epoch": 37.84615384615385, + "grad_norm": 11.755511283874512, + "learning_rate": 4.214285714285714e-07, + "loss": 0.247, + "step": 246 + }, + { + "epoch": 38.0, + "grad_norm": 14.939422607421875, + "learning_rate": 4.202380952380952e-07, + "loss": 0.3883, + "step": 247 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.8534798534798534, + "eval_loss": 0.47668036818504333, + "eval_runtime": 1.2822, + "eval_samples_per_second": 212.923, + "eval_steps_per_second": 3.9, + "step": 247 + }, + { + "epoch": 38.15384615384615, + "grad_norm": 7.6719279289245605, + "learning_rate": 4.1904761904761906e-07, + "loss": 0.3579, + "step": 248 + }, + { + "epoch": 38.30769230769231, + "grad_norm": 18.015718460083008, + "learning_rate": 4.1785714285714283e-07, + "loss": 0.3072, + "step": 249 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 13.246123313903809, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.3756, + "step": 250 + }, + { + "epoch": 38.61538461538461, + "grad_norm": 7.806217670440674, + "learning_rate": 4.154761904761905e-07, + "loss": 0.3919, + "step": 251 + }, + { + "epoch": 38.76923076923077, + "grad_norm": 5.912841796875, + "learning_rate": 4.142857142857143e-07, + "loss": 0.3079, + "step": 252 + }, + { + "epoch": 38.92307692307692, + "grad_norm": 7.757283687591553, + "learning_rate": 4.1309523809523805e-07, + "loss": 0.3219, + "step": 253 + }, + { + "epoch": 38.92307692307692, + "eval_accuracy": 0.8534798534798534, + "eval_loss": 0.4762944281101227, + "eval_runtime": 1.2149, + "eval_samples_per_second": 224.702, + "eval_steps_per_second": 4.115, + "step": 253 + }, + { + "epoch": 39.07692307692308, + "grad_norm": 7.3901238441467285, + "learning_rate": 4.119047619047619e-07, + "loss": 0.2908, + "step": 254 + }, + { + "epoch": 39.23076923076923, + "grad_norm": 6.3056535720825195, + "learning_rate": 4.1071428571428566e-07, + "loss": 0.2982, + "step": 255 + }, + { + "epoch": 39.38461538461539, + "grad_norm": 9.348855018615723, + "learning_rate": 4.095238095238095e-07, + "loss": 0.3693, + "step": 256 + }, + { + "epoch": 39.53846153846154, + "grad_norm": 7.346530914306641, + "learning_rate": 4.083333333333333e-07, + "loss": 0.3475, + "step": 257 + }, + { + "epoch": 39.69230769230769, + "grad_norm": 5.94871711730957, + "learning_rate": 4.071428571428571e-07, + "loss": 0.3274, + "step": 258 + }, + { + "epoch": 39.84615384615385, + "grad_norm": 35.71484375, + "learning_rate": 4.0595238095238093e-07, + "loss": 0.3718, + "step": 259 + }, + { + "epoch": 40.0, + "grad_norm": 13.31654167175293, + "learning_rate": 4.0476190476190476e-07, + "loss": 0.4351, + "step": 260 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.8571428571428571, + "eval_loss": 0.47377410531044006, + "eval_runtime": 1.1651, + "eval_samples_per_second": 234.318, + "eval_steps_per_second": 4.292, + "step": 260 + }, + { + "epoch": 40.15384615384615, + "grad_norm": 8.30079460144043, + "learning_rate": 4.0357142857142854e-07, + "loss": 0.4084, + "step": 261 + }, + { + "epoch": 40.30769230769231, + "grad_norm": 11.171014785766602, + "learning_rate": 4.0238095238095237e-07, + "loss": 0.2589, + "step": 262 + }, + { + "epoch": 40.46153846153846, + "grad_norm": 12.395658493041992, + "learning_rate": 4.011904761904762e-07, + "loss": 0.3645, + "step": 263 + }, + { + "epoch": 40.61538461538461, + "grad_norm": 12.52223014831543, + "learning_rate": 4e-07, + "loss": 0.3063, + "step": 264 + }, + { + "epoch": 40.76923076923077, + "grad_norm": 22.095630645751953, + "learning_rate": 3.988095238095238e-07, + "loss": 0.2982, + "step": 265 + }, + { + "epoch": 40.92307692307692, + "grad_norm": 19.723215103149414, + "learning_rate": 3.976190476190476e-07, + "loss": 0.3068, + "step": 266 + }, + { + "epoch": 40.92307692307692, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.46877339482307434, + "eval_runtime": 1.1413, + "eval_samples_per_second": 239.204, + "eval_steps_per_second": 4.381, + "step": 266 + }, + { + "epoch": 41.07692307692308, + "grad_norm": 14.390270233154297, + "learning_rate": 3.9642857142857137e-07, + "loss": 0.3193, + "step": 267 + }, + { + "epoch": 41.23076923076923, + "grad_norm": 14.494707107543945, + "learning_rate": 3.952380952380952e-07, + "loss": 0.3274, + "step": 268 + }, + { + "epoch": 41.38461538461539, + "grad_norm": 9.31578540802002, + "learning_rate": 3.9404761904761903e-07, + "loss": 0.2905, + "step": 269 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 11.39842700958252, + "learning_rate": 3.928571428571428e-07, + "loss": 0.2591, + "step": 270 + }, + { + "epoch": 41.69230769230769, + "grad_norm": 12.236638069152832, + "learning_rate": 3.9166666666666664e-07, + "loss": 0.3198, + "step": 271 + }, + { + "epoch": 41.84615384615385, + "grad_norm": 14.803117752075195, + "learning_rate": 3.9047619047619047e-07, + "loss": 0.3718, + "step": 272 + }, + { + "epoch": 42.0, + "grad_norm": 12.712557792663574, + "learning_rate": 3.8928571428571425e-07, + "loss": 0.3356, + "step": 273 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.45851626992225647, + "eval_runtime": 1.178, + "eval_samples_per_second": 231.744, + "eval_steps_per_second": 4.244, + "step": 273 + }, + { + "epoch": 42.15384615384615, + "grad_norm": 5.399446964263916, + "learning_rate": 3.880952380952381e-07, + "loss": 0.3028, + "step": 274 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 9.010210990905762, + "learning_rate": 3.869047619047619e-07, + "loss": 0.2897, + "step": 275 + }, + { + "epoch": 42.46153846153846, + "grad_norm": 8.666064262390137, + "learning_rate": 3.857142857142857e-07, + "loss": 0.3349, + "step": 276 + }, + { + "epoch": 42.61538461538461, + "grad_norm": 25.002635955810547, + "learning_rate": 3.845238095238095e-07, + "loss": 0.3166, + "step": 277 + }, + { + "epoch": 42.76923076923077, + "grad_norm": 13.861302375793457, + "learning_rate": 3.8333333333333335e-07, + "loss": 0.3475, + "step": 278 + }, + { + "epoch": 42.92307692307692, + "grad_norm": 11.409740447998047, + "learning_rate": 3.821428571428571e-07, + "loss": 0.345, + "step": 279 + }, + { + "epoch": 42.92307692307692, + "eval_accuracy": 0.8681318681318682, + "eval_loss": 0.4540693759918213, + "eval_runtime": 1.1168, + "eval_samples_per_second": 244.446, + "eval_steps_per_second": 4.477, + "step": 279 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 21.283527374267578, + "learning_rate": 3.809523809523809e-07, + "loss": 0.3316, + "step": 280 + }, + { + "epoch": 43.23076923076923, + "grad_norm": 7.409358978271484, + "learning_rate": 3.7976190476190474e-07, + "loss": 0.3305, + "step": 281 + }, + { + "epoch": 43.38461538461539, + "grad_norm": 11.763964653015137, + "learning_rate": 3.785714285714285e-07, + "loss": 0.3293, + "step": 282 + }, + { + "epoch": 43.53846153846154, + "grad_norm": 5.29448127746582, + "learning_rate": 3.7738095238095235e-07, + "loss": 0.2883, + "step": 283 + }, + { + "epoch": 43.69230769230769, + "grad_norm": 16.18635368347168, + "learning_rate": 3.761904761904762e-07, + "loss": 0.3549, + "step": 284 + }, + { + "epoch": 43.84615384615385, + "grad_norm": 17.321565628051758, + "learning_rate": 3.75e-07, + "loss": 0.3016, + "step": 285 + }, + { + "epoch": 44.0, + "grad_norm": 8.436405181884766, + "learning_rate": 3.738095238095238e-07, + "loss": 0.3254, + "step": 286 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.45843759179115295, + "eval_runtime": 1.1597, + "eval_samples_per_second": 235.408, + "eval_steps_per_second": 4.311, + "step": 286 + }, + { + "epoch": 44.15384615384615, + "grad_norm": 11.177824974060059, + "learning_rate": 3.726190476190476e-07, + "loss": 0.3276, + "step": 287 + }, + { + "epoch": 44.30769230769231, + "grad_norm": 10.337651252746582, + "learning_rate": 3.7142857142857145e-07, + "loss": 0.3419, + "step": 288 + }, + { + "epoch": 44.46153846153846, + "grad_norm": 21.42737579345703, + "learning_rate": 3.7023809523809523e-07, + "loss": 0.2689, + "step": 289 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 11.9132661819458, + "learning_rate": 3.6904761904761906e-07, + "loss": 0.3776, + "step": 290 + }, + { + "epoch": 44.76923076923077, + "grad_norm": 14.86318302154541, + "learning_rate": 3.678571428571429e-07, + "loss": 0.3984, + "step": 291 + }, + { + "epoch": 44.92307692307692, + "grad_norm": 15.56070327758789, + "learning_rate": 3.666666666666666e-07, + "loss": 0.3164, + "step": 292 + }, + { + "epoch": 44.92307692307692, + "eval_accuracy": 0.8571428571428571, + "eval_loss": 0.4591527581214905, + "eval_runtime": 1.1076, + "eval_samples_per_second": 246.479, + "eval_steps_per_second": 4.514, + "step": 292 + }, + { + "epoch": 45.07692307692308, + "grad_norm": 14.275500297546387, + "learning_rate": 3.6547619047619045e-07, + "loss": 0.2905, + "step": 293 + }, + { + "epoch": 45.23076923076923, + "grad_norm": 11.206035614013672, + "learning_rate": 3.642857142857143e-07, + "loss": 0.2992, + "step": 294 + }, + { + "epoch": 45.38461538461539, + "grad_norm": 10.007750511169434, + "learning_rate": 3.6309523809523805e-07, + "loss": 0.3662, + "step": 295 + }, + { + "epoch": 45.53846153846154, + "grad_norm": 13.443836212158203, + "learning_rate": 3.619047619047619e-07, + "loss": 0.3129, + "step": 296 + }, + { + "epoch": 45.69230769230769, + "grad_norm": 22.08678436279297, + "learning_rate": 3.607142857142857e-07, + "loss": 0.3904, + "step": 297 + }, + { + "epoch": 45.84615384615385, + "grad_norm": 24.27136993408203, + "learning_rate": 3.595238095238095e-07, + "loss": 0.3193, + "step": 298 + }, + { + "epoch": 46.0, + "grad_norm": 9.215178489685059, + "learning_rate": 3.583333333333333e-07, + "loss": 0.3657, + "step": 299 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.8608058608058609, + "eval_loss": 0.4533578157424927, + "eval_runtime": 1.1346, + "eval_samples_per_second": 240.624, + "eval_steps_per_second": 4.407, + "step": 299 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 13.869656562805176, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.3241, + "step": 300 + }, + { + "epoch": 46.30769230769231, + "grad_norm": 13.870816230773926, + "learning_rate": 3.5595238095238094e-07, + "loss": 0.294, + "step": 301 + }, + { + "epoch": 46.46153846153846, + "grad_norm": 5.2440338134765625, + "learning_rate": 3.5476190476190477e-07, + "loss": 0.3046, + "step": 302 + }, + { + "epoch": 46.61538461538461, + "grad_norm": 11.387068748474121, + "learning_rate": 3.535714285714286e-07, + "loss": 0.4067, + "step": 303 + }, + { + "epoch": 46.76923076923077, + "grad_norm": 10.643122673034668, + "learning_rate": 3.523809523809524e-07, + "loss": 0.3217, + "step": 304 + }, + { + "epoch": 46.92307692307692, + "grad_norm": 21.845155715942383, + "learning_rate": 3.5119047619047615e-07, + "loss": 0.2655, + "step": 305 + }, + { + "epoch": 46.92307692307692, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.4501632750034332, + "eval_runtime": 1.1413, + "eval_samples_per_second": 239.204, + "eval_steps_per_second": 4.381, + "step": 305 + }, + { + "epoch": 47.07692307692308, + "grad_norm": 12.1947603225708, + "learning_rate": 3.5e-07, + "loss": 0.2994, + "step": 306 + }, + { + "epoch": 47.23076923076923, + "grad_norm": 21.314617156982422, + "learning_rate": 3.4880952380952376e-07, + "loss": 0.2751, + "step": 307 + }, + { + "epoch": 47.38461538461539, + "grad_norm": 5.636424541473389, + "learning_rate": 3.476190476190476e-07, + "loss": 0.292, + "step": 308 + }, + { + "epoch": 47.53846153846154, + "grad_norm": 11.540352821350098, + "learning_rate": 3.464285714285714e-07, + "loss": 0.3693, + "step": 309 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 9.435784339904785, + "learning_rate": 3.452380952380952e-07, + "loss": 0.334, + "step": 310 + }, + { + "epoch": 47.84615384615385, + "grad_norm": 14.798314094543457, + "learning_rate": 3.4404761904761903e-07, + "loss": 0.2237, + "step": 311 + }, + { + "epoch": 48.0, + "grad_norm": 10.26159954071045, + "learning_rate": 3.4285714285714286e-07, + "loss": 0.2981, + "step": 312 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.4451758563518524, + "eval_runtime": 1.2162, + "eval_samples_per_second": 224.473, + "eval_steps_per_second": 4.111, + "step": 312 + }, + { + "epoch": 48.15384615384615, + "grad_norm": 19.33696937561035, + "learning_rate": 3.4166666666666664e-07, + "loss": 0.2414, + "step": 313 + }, + { + "epoch": 48.30769230769231, + "grad_norm": 8.318500518798828, + "learning_rate": 3.4047619047619047e-07, + "loss": 0.3193, + "step": 314 + }, + { + "epoch": 48.46153846153846, + "grad_norm": 19.92133140563965, + "learning_rate": 3.392857142857143e-07, + "loss": 0.3218, + "step": 315 + }, + { + "epoch": 48.61538461538461, + "grad_norm": 18.465848922729492, + "learning_rate": 3.380952380952381e-07, + "loss": 0.3046, + "step": 316 + }, + { + "epoch": 48.76923076923077, + "grad_norm": 20.254159927368164, + "learning_rate": 3.369047619047619e-07, + "loss": 0.3651, + "step": 317 + }, + { + "epoch": 48.92307692307692, + "grad_norm": 9.707018852233887, + "learning_rate": 3.357142857142857e-07, + "loss": 0.3508, + "step": 318 + }, + { + "epoch": 48.92307692307692, + "eval_accuracy": 0.8791208791208791, + "eval_loss": 0.4371393620967865, + "eval_runtime": 1.2035, + "eval_samples_per_second": 226.845, + "eval_steps_per_second": 4.155, + "step": 318 + }, + { + "epoch": 49.07692307692308, + "grad_norm": 9.588353157043457, + "learning_rate": 3.3452380952380947e-07, + "loss": 0.4015, + "step": 319 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 19.106985092163086, + "learning_rate": 3.333333333333333e-07, + "loss": 0.2461, + "step": 320 + }, + { + "epoch": 49.38461538461539, + "grad_norm": 18.668371200561523, + "learning_rate": 3.3214285714285713e-07, + "loss": 0.3201, + "step": 321 + }, + { + "epoch": 49.53846153846154, + "grad_norm": 22.97618865966797, + "learning_rate": 3.309523809523809e-07, + "loss": 0.2973, + "step": 322 + }, + { + "epoch": 49.69230769230769, + "grad_norm": 15.057040214538574, + "learning_rate": 3.2976190476190474e-07, + "loss": 0.2931, + "step": 323 + }, + { + "epoch": 49.84615384615385, + "grad_norm": 9.635611534118652, + "learning_rate": 3.2857142857142857e-07, + "loss": 0.3173, + "step": 324 + }, + { + "epoch": 50.0, + "grad_norm": 8.364943504333496, + "learning_rate": 3.2738095238095235e-07, + "loss": 0.3419, + "step": 325 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.43940499424934387, + "eval_runtime": 1.1837, + "eval_samples_per_second": 230.634, + "eval_steps_per_second": 4.224, + "step": 325 + }, + { + "epoch": 50.15384615384615, + "grad_norm": 10.205245018005371, + "learning_rate": 3.261904761904762e-07, + "loss": 0.3744, + "step": 326 + }, + { + "epoch": 50.30769230769231, + "grad_norm": 8.429767608642578, + "learning_rate": 3.25e-07, + "loss": 0.2852, + "step": 327 + }, + { + "epoch": 50.46153846153846, + "grad_norm": 19.509811401367188, + "learning_rate": 3.238095238095238e-07, + "loss": 0.283, + "step": 328 + }, + { + "epoch": 50.61538461538461, + "grad_norm": 12.072210311889648, + "learning_rate": 3.226190476190476e-07, + "loss": 0.2955, + "step": 329 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 19.032461166381836, + "learning_rate": 3.2142857142857145e-07, + "loss": 0.351, + "step": 330 + }, + { + "epoch": 50.92307692307692, + "grad_norm": 24.8001708984375, + "learning_rate": 3.202380952380952e-07, + "loss": 0.2668, + "step": 331 + }, + { + "epoch": 50.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4430113434791565, + "eval_runtime": 1.1828, + "eval_samples_per_second": 230.809, + "eval_steps_per_second": 4.227, + "step": 331 + }, + { + "epoch": 51.07692307692308, + "grad_norm": 7.619977951049805, + "learning_rate": 3.19047619047619e-07, + "loss": 0.2316, + "step": 332 + }, + { + "epoch": 51.23076923076923, + "grad_norm": 15.534941673278809, + "learning_rate": 3.1785714285714284e-07, + "loss": 0.2948, + "step": 333 + }, + { + "epoch": 51.38461538461539, + "grad_norm": 6.369411945343018, + "learning_rate": 3.166666666666666e-07, + "loss": 0.2319, + "step": 334 + }, + { + "epoch": 51.53846153846154, + "grad_norm": 12.510886192321777, + "learning_rate": 3.1547619047619045e-07, + "loss": 0.3389, + "step": 335 + }, + { + "epoch": 51.69230769230769, + "grad_norm": 9.731184005737305, + "learning_rate": 3.142857142857143e-07, + "loss": 0.2822, + "step": 336 + }, + { + "epoch": 51.84615384615385, + "grad_norm": 6.847411155700684, + "learning_rate": 3.1309523809523806e-07, + "loss": 0.3447, + "step": 337 + }, + { + "epoch": 52.0, + "grad_norm": 15.400504112243652, + "learning_rate": 3.119047619047619e-07, + "loss": 0.2972, + "step": 338 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.43954789638519287, + "eval_runtime": 1.1577, + "eval_samples_per_second": 235.817, + "eval_steps_per_second": 4.319, + "step": 338 + }, + { + "epoch": 52.15384615384615, + "grad_norm": 14.40497875213623, + "learning_rate": 3.107142857142857e-07, + "loss": 0.2947, + "step": 339 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 12.60912799835205, + "learning_rate": 3.095238095238095e-07, + "loss": 0.2866, + "step": 340 + }, + { + "epoch": 52.46153846153846, + "grad_norm": 10.782893180847168, + "learning_rate": 3.0833333333333333e-07, + "loss": 0.275, + "step": 341 + }, + { + "epoch": 52.61538461538461, + "grad_norm": 14.848359107971191, + "learning_rate": 3.0714285714285716e-07, + "loss": 0.3377, + "step": 342 + }, + { + "epoch": 52.76923076923077, + "grad_norm": 16.875308990478516, + "learning_rate": 3.0595238095238094e-07, + "loss": 0.2871, + "step": 343 + }, + { + "epoch": 52.92307692307692, + "grad_norm": 10.62590217590332, + "learning_rate": 3.0476190476190477e-07, + "loss": 0.3514, + "step": 344 + }, + { + "epoch": 52.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.437090665102005, + "eval_runtime": 1.1411, + "eval_samples_per_second": 239.24, + "eval_steps_per_second": 4.382, + "step": 344 + }, + { + "epoch": 53.07692307692308, + "grad_norm": 19.662609100341797, + "learning_rate": 3.0357142857142855e-07, + "loss": 0.2457, + "step": 345 + }, + { + "epoch": 53.23076923076923, + "grad_norm": 10.951351165771484, + "learning_rate": 3.023809523809523e-07, + "loss": 0.2542, + "step": 346 + }, + { + "epoch": 53.38461538461539, + "grad_norm": 6.810473918914795, + "learning_rate": 3.0119047619047616e-07, + "loss": 0.2873, + "step": 347 + }, + { + "epoch": 53.53846153846154, + "grad_norm": 11.747807502746582, + "learning_rate": 3e-07, + "loss": 0.3965, + "step": 348 + }, + { + "epoch": 53.69230769230769, + "grad_norm": 12.671740531921387, + "learning_rate": 2.9880952380952376e-07, + "loss": 0.3395, + "step": 349 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 10.57718276977539, + "learning_rate": 2.976190476190476e-07, + "loss": 0.3071, + "step": 350 + }, + { + "epoch": 54.0, + "grad_norm": 7.219900131225586, + "learning_rate": 2.9642857142857143e-07, + "loss": 0.3012, + "step": 351 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.8791208791208791, + "eval_loss": 0.43296605348587036, + "eval_runtime": 1.1556, + "eval_samples_per_second": 236.25, + "eval_steps_per_second": 4.327, + "step": 351 + }, + { + "epoch": 54.15384615384615, + "grad_norm": 7.588340759277344, + "learning_rate": 2.952380952380952e-07, + "loss": 0.3188, + "step": 352 + }, + { + "epoch": 54.30769230769231, + "grad_norm": 20.165128707885742, + "learning_rate": 2.9404761904761904e-07, + "loss": 0.377, + "step": 353 + }, + { + "epoch": 54.46153846153846, + "grad_norm": 9.232548713684082, + "learning_rate": 2.9285714285714287e-07, + "loss": 0.276, + "step": 354 + }, + { + "epoch": 54.61538461538461, + "grad_norm": 8.916671752929688, + "learning_rate": 2.916666666666667e-07, + "loss": 0.3599, + "step": 355 + }, + { + "epoch": 54.76923076923077, + "grad_norm": 11.789280891418457, + "learning_rate": 2.904761904761905e-07, + "loss": 0.3004, + "step": 356 + }, + { + "epoch": 54.92307692307692, + "grad_norm": 8.527569770812988, + "learning_rate": 2.892857142857143e-07, + "loss": 0.2725, + "step": 357 + }, + { + "epoch": 54.92307692307692, + "eval_accuracy": 0.8791208791208791, + "eval_loss": 0.4297783672809601, + "eval_runtime": 1.1219, + "eval_samples_per_second": 243.346, + "eval_steps_per_second": 4.457, + "step": 357 + }, + { + "epoch": 55.07692307692308, + "grad_norm": 9.423765182495117, + "learning_rate": 2.8809523809523803e-07, + "loss": 0.2731, + "step": 358 + }, + { + "epoch": 55.23076923076923, + "grad_norm": 13.653554916381836, + "learning_rate": 2.8690476190476186e-07, + "loss": 0.4064, + "step": 359 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 7.217813491821289, + "learning_rate": 2.857142857142857e-07, + "loss": 0.305, + "step": 360 + }, + { + "epoch": 55.53846153846154, + "grad_norm": 7.166869640350342, + "learning_rate": 2.845238095238095e-07, + "loss": 0.2488, + "step": 361 + }, + { + "epoch": 55.69230769230769, + "grad_norm": 6.370850086212158, + "learning_rate": 2.833333333333333e-07, + "loss": 0.2946, + "step": 362 + }, + { + "epoch": 55.84615384615385, + "grad_norm": 19.36492919921875, + "learning_rate": 2.8214285714285713e-07, + "loss": 0.3314, + "step": 363 + }, + { + "epoch": 56.0, + "grad_norm": 10.563324928283691, + "learning_rate": 2.8095238095238096e-07, + "loss": 0.2547, + "step": 364 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4288838803768158, + "eval_runtime": 1.1679, + "eval_samples_per_second": 233.746, + "eval_steps_per_second": 4.281, + "step": 364 + }, + { + "epoch": 56.15384615384615, + "grad_norm": 10.190791130065918, + "learning_rate": 2.7976190476190474e-07, + "loss": 0.2646, + "step": 365 + }, + { + "epoch": 56.30769230769231, + "grad_norm": 16.170412063598633, + "learning_rate": 2.785714285714286e-07, + "loss": 0.3392, + "step": 366 + }, + { + "epoch": 56.46153846153846, + "grad_norm": 7.313807964324951, + "learning_rate": 2.773809523809524e-07, + "loss": 0.2909, + "step": 367 + }, + { + "epoch": 56.61538461538461, + "grad_norm": 9.653914451599121, + "learning_rate": 2.761904761904762e-07, + "loss": 0.3295, + "step": 368 + }, + { + "epoch": 56.76923076923077, + "grad_norm": 6.966893672943115, + "learning_rate": 2.75e-07, + "loss": 0.2956, + "step": 369 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 15.753593444824219, + "learning_rate": 2.7380952380952385e-07, + "loss": 0.2896, + "step": 370 + }, + { + "epoch": 56.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4282112419605255, + "eval_runtime": 1.1522, + "eval_samples_per_second": 236.937, + "eval_steps_per_second": 4.339, + "step": 370 + }, + { + "epoch": 57.07692307692308, + "grad_norm": 13.775495529174805, + "learning_rate": 2.7261904761904757e-07, + "loss": 0.386, + "step": 371 + }, + { + "epoch": 57.23076923076923, + "grad_norm": 6.003649711608887, + "learning_rate": 2.714285714285714e-07, + "loss": 0.2473, + "step": 372 + }, + { + "epoch": 57.38461538461539, + "grad_norm": 22.614078521728516, + "learning_rate": 2.7023809523809523e-07, + "loss": 0.4098, + "step": 373 + }, + { + "epoch": 57.53846153846154, + "grad_norm": 15.326905250549316, + "learning_rate": 2.69047619047619e-07, + "loss": 0.2628, + "step": 374 + }, + { + "epoch": 57.69230769230769, + "grad_norm": 6.482524871826172, + "learning_rate": 2.6785714285714284e-07, + "loss": 0.3312, + "step": 375 + }, + { + "epoch": 57.84615384615385, + "grad_norm": 19.687318801879883, + "learning_rate": 2.6666666666666667e-07, + "loss": 0.3625, + "step": 376 + }, + { + "epoch": 58.0, + "grad_norm": 11.662421226501465, + "learning_rate": 2.6547619047619045e-07, + "loss": 0.3469, + "step": 377 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4272923469543457, + "eval_runtime": 1.1242, + "eval_samples_per_second": 242.848, + "eval_steps_per_second": 4.448, + "step": 377 + }, + { + "epoch": 58.15384615384615, + "grad_norm": 8.805383682250977, + "learning_rate": 2.642857142857143e-07, + "loss": 0.3987, + "step": 378 + }, + { + "epoch": 58.30769230769231, + "grad_norm": 11.661012649536133, + "learning_rate": 2.630952380952381e-07, + "loss": 0.2147, + "step": 379 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 12.969446182250977, + "learning_rate": 2.619047619047619e-07, + "loss": 0.3074, + "step": 380 + }, + { + "epoch": 58.61538461538461, + "grad_norm": 9.435002326965332, + "learning_rate": 2.607142857142857e-07, + "loss": 0.2626, + "step": 381 + }, + { + "epoch": 58.76923076923077, + "grad_norm": 14.217181205749512, + "learning_rate": 2.5952380952380955e-07, + "loss": 0.2264, + "step": 382 + }, + { + "epoch": 58.92307692307692, + "grad_norm": 13.90890884399414, + "learning_rate": 2.5833333333333333e-07, + "loss": 0.3528, + "step": 383 + }, + { + "epoch": 58.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4268935024738312, + "eval_runtime": 1.1071, + "eval_samples_per_second": 246.586, + "eval_steps_per_second": 4.516, + "step": 383 + }, + { + "epoch": 59.07692307692308, + "grad_norm": 6.627048492431641, + "learning_rate": 2.571428571428571e-07, + "loss": 0.2552, + "step": 384 + }, + { + "epoch": 59.23076923076923, + "grad_norm": 14.275843620300293, + "learning_rate": 2.5595238095238094e-07, + "loss": 0.2876, + "step": 385 + }, + { + "epoch": 59.38461538461539, + "grad_norm": 15.865604400634766, + "learning_rate": 2.547619047619047e-07, + "loss": 0.3701, + "step": 386 + }, + { + "epoch": 59.53846153846154, + "grad_norm": 12.051728248596191, + "learning_rate": 2.5357142857142855e-07, + "loss": 0.2598, + "step": 387 + }, + { + "epoch": 59.69230769230769, + "grad_norm": 11.886255264282227, + "learning_rate": 2.523809523809524e-07, + "loss": 0.238, + "step": 388 + }, + { + "epoch": 59.84615384615385, + "grad_norm": 12.55905818939209, + "learning_rate": 2.5119047619047616e-07, + "loss": 0.2893, + "step": 389 + }, + { + "epoch": 60.0, + "grad_norm": 18.840225219726562, + "learning_rate": 2.5e-07, + "loss": 0.2552, + "step": 390 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.8681318681318682, + "eval_loss": 0.4324240982532501, + "eval_runtime": 1.1584, + "eval_samples_per_second": 235.676, + "eval_steps_per_second": 4.316, + "step": 390 + }, + { + "epoch": 60.15384615384615, + "grad_norm": 7.739254474639893, + "learning_rate": 2.488095238095238e-07, + "loss": 0.3825, + "step": 391 + }, + { + "epoch": 60.30769230769231, + "grad_norm": 27.177335739135742, + "learning_rate": 2.476190476190476e-07, + "loss": 0.3577, + "step": 392 + }, + { + "epoch": 60.46153846153846, + "grad_norm": 8.35522747039795, + "learning_rate": 2.4642857142857143e-07, + "loss": 0.296, + "step": 393 + }, + { + "epoch": 60.61538461538461, + "grad_norm": 12.505022048950195, + "learning_rate": 2.452380952380952e-07, + "loss": 0.2922, + "step": 394 + }, + { + "epoch": 60.76923076923077, + "grad_norm": 8.860665321350098, + "learning_rate": 2.4404761904761904e-07, + "loss": 0.3924, + "step": 395 + }, + { + "epoch": 60.92307692307692, + "grad_norm": 13.714400291442871, + "learning_rate": 2.4285714285714287e-07, + "loss": 0.239, + "step": 396 + }, + { + "epoch": 60.92307692307692, + "eval_accuracy": 0.8644688644688645, + "eval_loss": 0.4319455325603485, + "eval_runtime": 1.158, + "eval_samples_per_second": 235.758, + "eval_steps_per_second": 4.318, + "step": 396 + }, + { + "epoch": 61.07692307692308, + "grad_norm": 11.82459831237793, + "learning_rate": 2.4166666666666665e-07, + "loss": 0.2555, + "step": 397 + }, + { + "epoch": 61.23076923076923, + "grad_norm": 13.206809997558594, + "learning_rate": 2.404761904761905e-07, + "loss": 0.3353, + "step": 398 + }, + { + "epoch": 61.38461538461539, + "grad_norm": 11.129719734191895, + "learning_rate": 2.392857142857143e-07, + "loss": 0.313, + "step": 399 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 19.73814582824707, + "learning_rate": 2.3809523809523806e-07, + "loss": 0.2591, + "step": 400 + }, + { + "epoch": 61.69230769230769, + "grad_norm": 10.5856351852417, + "learning_rate": 2.369047619047619e-07, + "loss": 0.3416, + "step": 401 + }, + { + "epoch": 61.84615384615385, + "grad_norm": 12.684653282165527, + "learning_rate": 2.357142857142857e-07, + "loss": 0.2315, + "step": 402 + }, + { + "epoch": 62.0, + "grad_norm": 8.558398246765137, + "learning_rate": 2.345238095238095e-07, + "loss": 0.3321, + "step": 403 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.42702218890190125, + "eval_runtime": 1.1757, + "eval_samples_per_second": 232.2, + "eval_steps_per_second": 4.253, + "step": 403 + }, + { + "epoch": 62.15384615384615, + "grad_norm": 9.598026275634766, + "learning_rate": 2.3333333333333333e-07, + "loss": 0.3119, + "step": 404 + }, + { + "epoch": 62.30769230769231, + "grad_norm": 13.107952117919922, + "learning_rate": 2.3214285714285714e-07, + "loss": 0.3379, + "step": 405 + }, + { + "epoch": 62.46153846153846, + "grad_norm": 18.639419555664062, + "learning_rate": 2.3095238095238097e-07, + "loss": 0.2689, + "step": 406 + }, + { + "epoch": 62.61538461538461, + "grad_norm": 17.175498962402344, + "learning_rate": 2.2976190476190475e-07, + "loss": 0.3154, + "step": 407 + }, + { + "epoch": 62.76923076923077, + "grad_norm": 11.377558708190918, + "learning_rate": 2.2857142857142855e-07, + "loss": 0.2969, + "step": 408 + }, + { + "epoch": 62.92307692307692, + "grad_norm": 12.16077709197998, + "learning_rate": 2.2738095238095238e-07, + "loss": 0.3115, + "step": 409 + }, + { + "epoch": 62.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.41838309168815613, + "eval_runtime": 1.133, + "eval_samples_per_second": 240.951, + "eval_steps_per_second": 4.413, + "step": 409 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 29.19352912902832, + "learning_rate": 2.2619047619047619e-07, + "loss": 0.327, + "step": 410 + }, + { + "epoch": 63.23076923076923, + "grad_norm": 21.762849807739258, + "learning_rate": 2.25e-07, + "loss": 0.3142, + "step": 411 + }, + { + "epoch": 63.38461538461539, + "grad_norm": 20.668453216552734, + "learning_rate": 2.2380952380952382e-07, + "loss": 0.2705, + "step": 412 + }, + { + "epoch": 63.53846153846154, + "grad_norm": 10.485206604003906, + "learning_rate": 2.226190476190476e-07, + "loss": 0.2635, + "step": 413 + }, + { + "epoch": 63.69230769230769, + "grad_norm": 5.819842338562012, + "learning_rate": 2.214285714285714e-07, + "loss": 0.281, + "step": 414 + }, + { + "epoch": 63.84615384615385, + "grad_norm": 8.578193664550781, + "learning_rate": 2.2023809523809523e-07, + "loss": 0.2981, + "step": 415 + }, + { + "epoch": 64.0, + "grad_norm": 14.02076244354248, + "learning_rate": 2.1904761904761904e-07, + "loss": 0.306, + "step": 416 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4168645739555359, + "eval_runtime": 1.1838, + "eval_samples_per_second": 230.615, + "eval_steps_per_second": 4.224, + "step": 416 + }, + { + "epoch": 64.15384615384616, + "grad_norm": 8.580323219299316, + "learning_rate": 2.1785714285714284e-07, + "loss": 0.2481, + "step": 417 + }, + { + "epoch": 64.3076923076923, + "grad_norm": 12.701449394226074, + "learning_rate": 2.1666666666666667e-07, + "loss": 0.2978, + "step": 418 + }, + { + "epoch": 64.46153846153847, + "grad_norm": 7.2950544357299805, + "learning_rate": 2.1547619047619048e-07, + "loss": 0.2778, + "step": 419 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 14.55117416381836, + "learning_rate": 2.1428571428571426e-07, + "loss": 0.3589, + "step": 420 + }, + { + "epoch": 64.76923076923077, + "grad_norm": 15.672528266906738, + "learning_rate": 2.130952380952381e-07, + "loss": 0.3638, + "step": 421 + }, + { + "epoch": 64.92307692307692, + "grad_norm": 8.848112106323242, + "learning_rate": 2.119047619047619e-07, + "loss": 0.3086, + "step": 422 + }, + { + "epoch": 64.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.41758179664611816, + "eval_runtime": 1.1479, + "eval_samples_per_second": 237.817, + "eval_steps_per_second": 4.356, + "step": 422 + }, + { + "epoch": 65.07692307692308, + "grad_norm": 20.250051498413086, + "learning_rate": 2.107142857142857e-07, + "loss": 0.2965, + "step": 423 + }, + { + "epoch": 65.23076923076923, + "grad_norm": 8.315861701965332, + "learning_rate": 2.0952380952380953e-07, + "loss": 0.3119, + "step": 424 + }, + { + "epoch": 65.38461538461539, + "grad_norm": 7.086258888244629, + "learning_rate": 2.0833333333333333e-07, + "loss": 0.2904, + "step": 425 + }, + { + "epoch": 65.53846153846153, + "grad_norm": 9.35214900970459, + "learning_rate": 2.0714285714285714e-07, + "loss": 0.2787, + "step": 426 + }, + { + "epoch": 65.6923076923077, + "grad_norm": 11.061731338500977, + "learning_rate": 2.0595238095238094e-07, + "loss": 0.287, + "step": 427 + }, + { + "epoch": 65.84615384615384, + "grad_norm": 17.736583709716797, + "learning_rate": 2.0476190476190475e-07, + "loss": 0.3101, + "step": 428 + }, + { + "epoch": 66.0, + "grad_norm": 20.51115608215332, + "learning_rate": 2.0357142857142855e-07, + "loss": 0.4256, + "step": 429 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4195899963378906, + "eval_runtime": 1.2137, + "eval_samples_per_second": 224.925, + "eval_steps_per_second": 4.119, + "step": 429 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 7.543363094329834, + "learning_rate": 2.0238095238095238e-07, + "loss": 0.1944, + "step": 430 + }, + { + "epoch": 66.3076923076923, + "grad_norm": 15.504809379577637, + "learning_rate": 2.0119047619047619e-07, + "loss": 0.2767, + "step": 431 + }, + { + "epoch": 66.46153846153847, + "grad_norm": 12.454192161560059, + "learning_rate": 2e-07, + "loss": 0.2697, + "step": 432 + }, + { + "epoch": 66.61538461538461, + "grad_norm": 11.19575023651123, + "learning_rate": 1.988095238095238e-07, + "loss": 0.2626, + "step": 433 + }, + { + "epoch": 66.76923076923077, + "grad_norm": 8.080245971679688, + "learning_rate": 1.976190476190476e-07, + "loss": 0.3262, + "step": 434 + }, + { + "epoch": 66.92307692307692, + "grad_norm": 16.002864837646484, + "learning_rate": 1.964285714285714e-07, + "loss": 0.2798, + "step": 435 + }, + { + "epoch": 66.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4219285249710083, + "eval_runtime": 1.1309, + "eval_samples_per_second": 241.401, + "eval_steps_per_second": 4.421, + "step": 435 + }, + { + "epoch": 67.07692307692308, + "grad_norm": 9.127429008483887, + "learning_rate": 1.9523809523809524e-07, + "loss": 0.2828, + "step": 436 + }, + { + "epoch": 67.23076923076923, + "grad_norm": 9.812334060668945, + "learning_rate": 1.9404761904761904e-07, + "loss": 0.3028, + "step": 437 + }, + { + "epoch": 67.38461538461539, + "grad_norm": 16.504629135131836, + "learning_rate": 1.9285714285714284e-07, + "loss": 0.2382, + "step": 438 + }, + { + "epoch": 67.53846153846153, + "grad_norm": 15.225912094116211, + "learning_rate": 1.9166666666666668e-07, + "loss": 0.3387, + "step": 439 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 32.0360221862793, + "learning_rate": 1.9047619047619045e-07, + "loss": 0.252, + "step": 440 + }, + { + "epoch": 67.84615384615384, + "grad_norm": 10.604074478149414, + "learning_rate": 1.8928571428571426e-07, + "loss": 0.3423, + "step": 441 + }, + { + "epoch": 68.0, + "grad_norm": 11.707740783691406, + "learning_rate": 1.880952380952381e-07, + "loss": 0.3016, + "step": 442 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4223931133747101, + "eval_runtime": 1.1901, + "eval_samples_per_second": 229.39, + "eval_steps_per_second": 4.201, + "step": 442 + }, + { + "epoch": 68.15384615384616, + "grad_norm": 15.720952987670898, + "learning_rate": 1.869047619047619e-07, + "loss": 0.2488, + "step": 443 + }, + { + "epoch": 68.3076923076923, + "grad_norm": 21.468849182128906, + "learning_rate": 1.8571428571428572e-07, + "loss": 0.3655, + "step": 444 + }, + { + "epoch": 68.46153846153847, + "grad_norm": 7.170112133026123, + "learning_rate": 1.8452380952380953e-07, + "loss": 0.2293, + "step": 445 + }, + { + "epoch": 68.61538461538461, + "grad_norm": 13.677178382873535, + "learning_rate": 1.833333333333333e-07, + "loss": 0.3512, + "step": 446 + }, + { + "epoch": 68.76923076923077, + "grad_norm": 15.128756523132324, + "learning_rate": 1.8214285714285714e-07, + "loss": 0.3291, + "step": 447 + }, + { + "epoch": 68.92307692307692, + "grad_norm": 9.91322135925293, + "learning_rate": 1.8095238095238094e-07, + "loss": 0.2791, + "step": 448 + }, + { + "epoch": 68.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.42070242762565613, + "eval_runtime": 1.1329, + "eval_samples_per_second": 240.966, + "eval_steps_per_second": 4.413, + "step": 448 + }, + { + "epoch": 69.07692307692308, + "grad_norm": 18.428592681884766, + "learning_rate": 1.7976190476190475e-07, + "loss": 0.2857, + "step": 449 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 13.563444137573242, + "learning_rate": 1.7857142857142858e-07, + "loss": 0.3579, + "step": 450 + }, + { + "epoch": 69.38461538461539, + "grad_norm": 7.071059226989746, + "learning_rate": 1.7738095238095238e-07, + "loss": 0.288, + "step": 451 + }, + { + "epoch": 69.53846153846153, + "grad_norm": 13.733859062194824, + "learning_rate": 1.761904761904762e-07, + "loss": 0.3107, + "step": 452 + }, + { + "epoch": 69.6923076923077, + "grad_norm": 24.475296020507812, + "learning_rate": 1.75e-07, + "loss": 0.3036, + "step": 453 + }, + { + "epoch": 69.84615384615384, + "grad_norm": 8.03947925567627, + "learning_rate": 1.738095238095238e-07, + "loss": 0.3046, + "step": 454 + }, + { + "epoch": 70.0, + "grad_norm": 12.644840240478516, + "learning_rate": 1.726190476190476e-07, + "loss": 0.2651, + "step": 455 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4188561737537384, + "eval_runtime": 1.1824, + "eval_samples_per_second": 230.892, + "eval_steps_per_second": 4.229, + "step": 455 + }, + { + "epoch": 70.15384615384616, + "grad_norm": 10.41529655456543, + "learning_rate": 1.7142857142857143e-07, + "loss": 0.2741, + "step": 456 + }, + { + "epoch": 70.3076923076923, + "grad_norm": 13.780343055725098, + "learning_rate": 1.7023809523809524e-07, + "loss": 0.1817, + "step": 457 + }, + { + "epoch": 70.46153846153847, + "grad_norm": 15.142984390258789, + "learning_rate": 1.6904761904761904e-07, + "loss": 0.2511, + "step": 458 + }, + { + "epoch": 70.61538461538461, + "grad_norm": 11.113754272460938, + "learning_rate": 1.6785714285714285e-07, + "loss": 0.3617, + "step": 459 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 9.247007369995117, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.3423, + "step": 460 + }, + { + "epoch": 70.92307692307692, + "grad_norm": 10.317591667175293, + "learning_rate": 1.6547619047619045e-07, + "loss": 0.2466, + "step": 461 + }, + { + "epoch": 70.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4177640378475189, + "eval_runtime": 1.1382, + "eval_samples_per_second": 239.846, + "eval_steps_per_second": 4.393, + "step": 461 + }, + { + "epoch": 71.07692307692308, + "grad_norm": 12.4044771194458, + "learning_rate": 1.6428571428571429e-07, + "loss": 0.274, + "step": 462 + }, + { + "epoch": 71.23076923076923, + "grad_norm": 12.682540893554688, + "learning_rate": 1.630952380952381e-07, + "loss": 0.3048, + "step": 463 + }, + { + "epoch": 71.38461538461539, + "grad_norm": 25.84153175354004, + "learning_rate": 1.619047619047619e-07, + "loss": 0.2463, + "step": 464 + }, + { + "epoch": 71.53846153846153, + "grad_norm": 13.235491752624512, + "learning_rate": 1.6071428571428573e-07, + "loss": 0.415, + "step": 465 + }, + { + "epoch": 71.6923076923077, + "grad_norm": 6.873939514160156, + "learning_rate": 1.595238095238095e-07, + "loss": 0.3163, + "step": 466 + }, + { + "epoch": 71.84615384615384, + "grad_norm": 16.569108963012695, + "learning_rate": 1.583333333333333e-07, + "loss": 0.3067, + "step": 467 + }, + { + "epoch": 72.0, + "grad_norm": 18.71702003479004, + "learning_rate": 1.5714285714285714e-07, + "loss": 0.1913, + "step": 468 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4177253544330597, + "eval_runtime": 1.1798, + "eval_samples_per_second": 231.388, + "eval_steps_per_second": 4.238, + "step": 468 + }, + { + "epoch": 72.15384615384616, + "grad_norm": 21.70823097229004, + "learning_rate": 1.5595238095238094e-07, + "loss": 0.3477, + "step": 469 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 18.4373779296875, + "learning_rate": 1.5476190476190475e-07, + "loss": 0.3226, + "step": 470 + }, + { + "epoch": 72.46153846153847, + "grad_norm": 12.795662879943848, + "learning_rate": 1.5357142857142858e-07, + "loss": 0.36, + "step": 471 + }, + { + "epoch": 72.61538461538461, + "grad_norm": 6.41522741317749, + "learning_rate": 1.5238095238095238e-07, + "loss": 0.2615, + "step": 472 + }, + { + "epoch": 72.76923076923077, + "grad_norm": 11.777397155761719, + "learning_rate": 1.5119047619047616e-07, + "loss": 0.2648, + "step": 473 + }, + { + "epoch": 72.92307692307692, + "grad_norm": 14.508996963500977, + "learning_rate": 1.5e-07, + "loss": 0.2719, + "step": 474 + }, + { + "epoch": 72.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4163550138473511, + "eval_runtime": 1.1181, + "eval_samples_per_second": 244.158, + "eval_steps_per_second": 4.472, + "step": 474 + }, + { + "epoch": 73.07692307692308, + "grad_norm": 15.088624000549316, + "learning_rate": 1.488095238095238e-07, + "loss": 0.3153, + "step": 475 + }, + { + "epoch": 73.23076923076923, + "grad_norm": 6.610665798187256, + "learning_rate": 1.476190476190476e-07, + "loss": 0.2292, + "step": 476 + }, + { + "epoch": 73.38461538461539, + "grad_norm": 19.090049743652344, + "learning_rate": 1.4642857142857143e-07, + "loss": 0.2765, + "step": 477 + }, + { + "epoch": 73.53846153846153, + "grad_norm": 19.875932693481445, + "learning_rate": 1.4523809523809524e-07, + "loss": 0.2797, + "step": 478 + }, + { + "epoch": 73.6923076923077, + "grad_norm": 21.46002960205078, + "learning_rate": 1.4404761904761902e-07, + "loss": 0.2846, + "step": 479 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 9.745357513427734, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.3138, + "step": 480 + }, + { + "epoch": 74.0, + "grad_norm": 9.963276863098145, + "learning_rate": 1.4166666666666665e-07, + "loss": 0.3364, + "step": 481 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.41662341356277466, + "eval_runtime": 1.1761, + "eval_samples_per_second": 232.123, + "eval_steps_per_second": 4.251, + "step": 481 + }, + { + "epoch": 74.15384615384616, + "grad_norm": 12.587307929992676, + "learning_rate": 1.4047619047619048e-07, + "loss": 0.3476, + "step": 482 + }, + { + "epoch": 74.3076923076923, + "grad_norm": 15.544804573059082, + "learning_rate": 1.392857142857143e-07, + "loss": 0.298, + "step": 483 + }, + { + "epoch": 74.46153846153847, + "grad_norm": 16.263813018798828, + "learning_rate": 1.380952380952381e-07, + "loss": 0.3103, + "step": 484 + }, + { + "epoch": 74.61538461538461, + "grad_norm": 15.350561141967773, + "learning_rate": 1.3690476190476192e-07, + "loss": 0.3083, + "step": 485 + }, + { + "epoch": 74.76923076923077, + "grad_norm": 18.922351837158203, + "learning_rate": 1.357142857142857e-07, + "loss": 0.2371, + "step": 486 + }, + { + "epoch": 74.92307692307692, + "grad_norm": 10.000033378601074, + "learning_rate": 1.345238095238095e-07, + "loss": 0.283, + "step": 487 + }, + { + "epoch": 74.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4179239273071289, + "eval_runtime": 1.1425, + "eval_samples_per_second": 238.952, + "eval_steps_per_second": 4.376, + "step": 487 + }, + { + "epoch": 75.07692307692308, + "grad_norm": 11.999493598937988, + "learning_rate": 1.3333333333333334e-07, + "loss": 0.2356, + "step": 488 + }, + { + "epoch": 75.23076923076923, + "grad_norm": 15.124945640563965, + "learning_rate": 1.3214285714285714e-07, + "loss": 0.2473, + "step": 489 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 5.885743618011475, + "learning_rate": 1.3095238095238095e-07, + "loss": 0.2623, + "step": 490 + }, + { + "epoch": 75.53846153846153, + "grad_norm": 17.72136116027832, + "learning_rate": 1.2976190476190478e-07, + "loss": 0.2667, + "step": 491 + }, + { + "epoch": 75.6923076923077, + "grad_norm": 17.649593353271484, + "learning_rate": 1.2857142857142855e-07, + "loss": 0.3016, + "step": 492 + }, + { + "epoch": 75.84615384615384, + "grad_norm": 15.614336013793945, + "learning_rate": 1.2738095238095236e-07, + "loss": 0.2771, + "step": 493 + }, + { + "epoch": 76.0, + "grad_norm": 6.446508884429932, + "learning_rate": 1.261904761904762e-07, + "loss": 0.2891, + "step": 494 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4174346923828125, + "eval_runtime": 1.1254, + "eval_samples_per_second": 242.591, + "eval_steps_per_second": 4.443, + "step": 494 + }, + { + "epoch": 76.15384615384616, + "grad_norm": 8.419719696044922, + "learning_rate": 1.25e-07, + "loss": 0.2639, + "step": 495 + }, + { + "epoch": 76.3076923076923, + "grad_norm": 8.500795364379883, + "learning_rate": 1.238095238095238e-07, + "loss": 0.2902, + "step": 496 + }, + { + "epoch": 76.46153846153847, + "grad_norm": 9.533052444458008, + "learning_rate": 1.226190476190476e-07, + "loss": 0.3342, + "step": 497 + }, + { + "epoch": 76.61538461538461, + "grad_norm": 32.33898162841797, + "learning_rate": 1.2142857142857143e-07, + "loss": 0.3059, + "step": 498 + }, + { + "epoch": 76.76923076923077, + "grad_norm": 14.258476257324219, + "learning_rate": 1.2023809523809524e-07, + "loss": 0.249, + "step": 499 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 28.383731842041016, + "learning_rate": 1.1904761904761903e-07, + "loss": 0.2625, + "step": 500 + }, + { + "epoch": 76.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4179657995700836, + "eval_runtime": 1.0921, + "eval_samples_per_second": 249.988, + "eval_steps_per_second": 4.579, + "step": 500 + }, + { + "epoch": 77.07692307692308, + "grad_norm": 14.453038215637207, + "learning_rate": 1.1785714285714285e-07, + "loss": 0.3501, + "step": 501 + }, + { + "epoch": 77.23076923076923, + "grad_norm": 15.141488075256348, + "learning_rate": 1.1666666666666667e-07, + "loss": 0.2959, + "step": 502 + }, + { + "epoch": 77.38461538461539, + "grad_norm": 27.595386505126953, + "learning_rate": 1.1547619047619048e-07, + "loss": 0.3883, + "step": 503 + }, + { + "epoch": 77.53846153846153, + "grad_norm": 7.041699409484863, + "learning_rate": 1.1428571428571427e-07, + "loss": 0.2567, + "step": 504 + }, + { + "epoch": 77.6923076923077, + "grad_norm": 5.878727912902832, + "learning_rate": 1.1309523809523809e-07, + "loss": 0.1855, + "step": 505 + }, + { + "epoch": 77.84615384615384, + "grad_norm": 9.228104591369629, + "learning_rate": 1.1190476190476191e-07, + "loss": 0.2607, + "step": 506 + }, + { + "epoch": 78.0, + "grad_norm": 10.124485969543457, + "learning_rate": 1.107142857142857e-07, + "loss": 0.2843, + "step": 507 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4184325635433197, + "eval_runtime": 1.2364, + "eval_samples_per_second": 220.81, + "eval_steps_per_second": 4.044, + "step": 507 + }, + { + "epoch": 78.15384615384616, + "grad_norm": 32.02544403076172, + "learning_rate": 1.0952380952380952e-07, + "loss": 0.2707, + "step": 508 + }, + { + "epoch": 78.3076923076923, + "grad_norm": 8.309106826782227, + "learning_rate": 1.0833333333333334e-07, + "loss": 0.3501, + "step": 509 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 10.85938549041748, + "learning_rate": 1.0714285714285713e-07, + "loss": 0.3379, + "step": 510 + }, + { + "epoch": 78.61538461538461, + "grad_norm": 14.523594856262207, + "learning_rate": 1.0595238095238095e-07, + "loss": 0.2191, + "step": 511 + }, + { + "epoch": 78.76923076923077, + "grad_norm": 16.100353240966797, + "learning_rate": 1.0476190476190476e-07, + "loss": 0.3082, + "step": 512 + }, + { + "epoch": 78.92307692307692, + "grad_norm": 10.894936561584473, + "learning_rate": 1.0357142857142857e-07, + "loss": 0.375, + "step": 513 + }, + { + "epoch": 78.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.41671693325042725, + "eval_runtime": 1.116, + "eval_samples_per_second": 244.622, + "eval_steps_per_second": 4.48, + "step": 513 + }, + { + "epoch": 79.07692307692308, + "grad_norm": 10.38732624053955, + "learning_rate": 1.0238095238095237e-07, + "loss": 0.2812, + "step": 514 + }, + { + "epoch": 79.23076923076923, + "grad_norm": 15.733484268188477, + "learning_rate": 1.0119047619047619e-07, + "loss": 0.3247, + "step": 515 + }, + { + "epoch": 79.38461538461539, + "grad_norm": 6.772809028625488, + "learning_rate": 1e-07, + "loss": 0.221, + "step": 516 + }, + { + "epoch": 79.53846153846153, + "grad_norm": 8.480406761169434, + "learning_rate": 9.88095238095238e-08, + "loss": 0.2817, + "step": 517 + }, + { + "epoch": 79.6923076923077, + "grad_norm": 8.911038398742676, + "learning_rate": 9.761904761904762e-08, + "loss": 0.3514, + "step": 518 + }, + { + "epoch": 79.84615384615384, + "grad_norm": 18.952770233154297, + "learning_rate": 9.642857142857142e-08, + "loss": 0.2881, + "step": 519 + }, + { + "epoch": 80.0, + "grad_norm": 6.310261249542236, + "learning_rate": 9.523809523809523e-08, + "loss": 0.3107, + "step": 520 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.41499239206314087, + "eval_runtime": 1.191, + "eval_samples_per_second": 229.225, + "eval_steps_per_second": 4.198, + "step": 520 + }, + { + "epoch": 80.15384615384616, + "grad_norm": 17.95909309387207, + "learning_rate": 9.404761904761904e-08, + "loss": 0.2551, + "step": 521 + }, + { + "epoch": 80.3076923076923, + "grad_norm": 27.223407745361328, + "learning_rate": 9.285714285714286e-08, + "loss": 0.3356, + "step": 522 + }, + { + "epoch": 80.46153846153847, + "grad_norm": 6.595218658447266, + "learning_rate": 9.166666666666665e-08, + "loss": 0.2383, + "step": 523 + }, + { + "epoch": 80.61538461538461, + "grad_norm": 20.459001541137695, + "learning_rate": 9.047619047619047e-08, + "loss": 0.3265, + "step": 524 + }, + { + "epoch": 80.76923076923077, + "grad_norm": 15.349759101867676, + "learning_rate": 8.928571428571429e-08, + "loss": 0.2763, + "step": 525 + }, + { + "epoch": 80.92307692307692, + "grad_norm": 10.789344787597656, + "learning_rate": 8.80952380952381e-08, + "loss": 0.3742, + "step": 526 + }, + { + "epoch": 80.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4144986569881439, + "eval_runtime": 1.1418, + "eval_samples_per_second": 239.087, + "eval_steps_per_second": 4.379, + "step": 526 + }, + { + "epoch": 81.07692307692308, + "grad_norm": 13.075380325317383, + "learning_rate": 8.69047619047619e-08, + "loss": 0.2957, + "step": 527 + }, + { + "epoch": 81.23076923076923, + "grad_norm": 10.450531959533691, + "learning_rate": 8.571428571428572e-08, + "loss": 0.3245, + "step": 528 + }, + { + "epoch": 81.38461538461539, + "grad_norm": 20.416603088378906, + "learning_rate": 8.452380952380952e-08, + "loss": 0.2355, + "step": 529 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 11.954909324645996, + "learning_rate": 8.333333333333333e-08, + "loss": 0.2847, + "step": 530 + }, + { + "epoch": 81.6923076923077, + "grad_norm": 10.024072647094727, + "learning_rate": 8.214285714285714e-08, + "loss": 0.3908, + "step": 531 + }, + { + "epoch": 81.84615384615384, + "grad_norm": 13.764399528503418, + "learning_rate": 8.095238095238095e-08, + "loss": 0.2062, + "step": 532 + }, + { + "epoch": 82.0, + "grad_norm": 18.268247604370117, + "learning_rate": 7.976190476190475e-08, + "loss": 0.2574, + "step": 533 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4145357608795166, + "eval_runtime": 1.2255, + "eval_samples_per_second": 222.772, + "eval_steps_per_second": 4.08, + "step": 533 + }, + { + "epoch": 82.15384615384616, + "grad_norm": 11.204299926757812, + "learning_rate": 7.857142857142857e-08, + "loss": 0.294, + "step": 534 + }, + { + "epoch": 82.3076923076923, + "grad_norm": 12.602919578552246, + "learning_rate": 7.738095238095237e-08, + "loss": 0.3379, + "step": 535 + }, + { + "epoch": 82.46153846153847, + "grad_norm": 17.001785278320312, + "learning_rate": 7.619047619047619e-08, + "loss": 0.2501, + "step": 536 + }, + { + "epoch": 82.61538461538461, + "grad_norm": 10.472540855407715, + "learning_rate": 7.5e-08, + "loss": 0.2673, + "step": 537 + }, + { + "epoch": 82.76923076923077, + "grad_norm": 12.93094539642334, + "learning_rate": 7.38095238095238e-08, + "loss": 0.3303, + "step": 538 + }, + { + "epoch": 82.92307692307692, + "grad_norm": 15.572615623474121, + "learning_rate": 7.261904761904762e-08, + "loss": 0.329, + "step": 539 + }, + { + "epoch": 82.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.41488900780677795, + "eval_runtime": 1.1463, + "eval_samples_per_second": 238.152, + "eval_steps_per_second": 4.362, + "step": 539 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 8.833599090576172, + "learning_rate": 7.142857142857142e-08, + "loss": 0.2862, + "step": 540 + }, + { + "epoch": 83.23076923076923, + "grad_norm": 7.227090358734131, + "learning_rate": 7.023809523809524e-08, + "loss": 0.2553, + "step": 541 + }, + { + "epoch": 83.38461538461539, + "grad_norm": 16.44085693359375, + "learning_rate": 6.904761904761905e-08, + "loss": 0.3129, + "step": 542 + }, + { + "epoch": 83.53846153846153, + "grad_norm": 13.633960723876953, + "learning_rate": 6.785714285714285e-08, + "loss": 0.3134, + "step": 543 + }, + { + "epoch": 83.6923076923077, + "grad_norm": 16.555570602416992, + "learning_rate": 6.666666666666667e-08, + "loss": 0.2504, + "step": 544 + }, + { + "epoch": 83.84615384615384, + "grad_norm": 7.340324878692627, + "learning_rate": 6.547619047619047e-08, + "loss": 0.2966, + "step": 545 + }, + { + "epoch": 84.0, + "grad_norm": 10.442778587341309, + "learning_rate": 6.428571428571428e-08, + "loss": 0.2727, + "step": 546 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.4145146608352661, + "eval_runtime": 1.1279, + "eval_samples_per_second": 242.042, + "eval_steps_per_second": 4.433, + "step": 546 + }, + { + "epoch": 84.15384615384616, + "grad_norm": 9.072919845581055, + "learning_rate": 6.30952380952381e-08, + "loss": 0.2461, + "step": 547 + }, + { + "epoch": 84.3076923076923, + "grad_norm": 8.624760627746582, + "learning_rate": 6.19047619047619e-08, + "loss": 0.2812, + "step": 548 + }, + { + "epoch": 84.46153846153847, + "grad_norm": 8.95349407196045, + "learning_rate": 6.071428571428572e-08, + "loss": 0.2835, + "step": 549 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 18.060441970825195, + "learning_rate": 5.9523809523809515e-08, + "loss": 0.2697, + "step": 550 + }, + { + "epoch": 84.76923076923077, + "grad_norm": 15.820292472839355, + "learning_rate": 5.833333333333333e-08, + "loss": 0.3266, + "step": 551 + }, + { + "epoch": 84.92307692307692, + "grad_norm": 10.503725051879883, + "learning_rate": 5.714285714285714e-08, + "loss": 0.2977, + "step": 552 + }, + { + "epoch": 84.92307692307692, + "eval_accuracy": 0.8754578754578755, + "eval_loss": 0.41494670510292053, + "eval_runtime": 1.1373, + "eval_samples_per_second": 240.052, + "eval_steps_per_second": 4.397, + "step": 552 + }, + { + "epoch": 85.07692307692308, + "grad_norm": 9.66882610321045, + "learning_rate": 5.5952380952380955e-08, + "loss": 0.3452, + "step": 553 + }, + { + "epoch": 85.23076923076923, + "grad_norm": 9.360061645507812, + "learning_rate": 5.476190476190476e-08, + "loss": 0.3033, + "step": 554 + }, + { + "epoch": 85.38461538461539, + "grad_norm": 10.878594398498535, + "learning_rate": 5.3571428571428564e-08, + "loss": 0.323, + "step": 555 + }, + { + "epoch": 85.53846153846153, + "grad_norm": 9.788655281066895, + "learning_rate": 5.238095238095238e-08, + "loss": 0.2796, + "step": 556 + }, + { + "epoch": 85.6923076923077, + "grad_norm": 11.249568939208984, + "learning_rate": 5.1190476190476187e-08, + "loss": 0.2381, + "step": 557 + }, + { + "epoch": 85.84615384615384, + "grad_norm": 22.544105529785156, + "learning_rate": 5e-08, + "loss": 0.3593, + "step": 558 + }, + { + "epoch": 86.0, + "grad_norm": 9.197429656982422, + "learning_rate": 4.880952380952381e-08, + "loss": 0.2611, + "step": 559 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4160268008708954, + "eval_runtime": 1.1319, + "eval_samples_per_second": 241.183, + "eval_steps_per_second": 4.417, + "step": 559 + }, + { + "epoch": 86.15384615384616, + "grad_norm": 10.480982780456543, + "learning_rate": 4.7619047619047613e-08, + "loss": 0.2919, + "step": 560 + }, + { + "epoch": 86.3076923076923, + "grad_norm": 15.81564712524414, + "learning_rate": 4.642857142857143e-08, + "loss": 0.3395, + "step": 561 + }, + { + "epoch": 86.46153846153847, + "grad_norm": 25.986629486083984, + "learning_rate": 4.5238095238095236e-08, + "loss": 0.3436, + "step": 562 + }, + { + "epoch": 86.61538461538461, + "grad_norm": 20.077136993408203, + "learning_rate": 4.404761904761905e-08, + "loss": 0.363, + "step": 563 + }, + { + "epoch": 86.76923076923077, + "grad_norm": 16.67424774169922, + "learning_rate": 4.285714285714286e-08, + "loss": 0.2731, + "step": 564 + }, + { + "epoch": 86.92307692307692, + "grad_norm": 16.004140853881836, + "learning_rate": 4.166666666666666e-08, + "loss": 0.2542, + "step": 565 + }, + { + "epoch": 86.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4169901907444, + "eval_runtime": 1.1368, + "eval_samples_per_second": 240.144, + "eval_steps_per_second": 4.398, + "step": 565 + }, + { + "epoch": 87.07692307692308, + "grad_norm": 18.212156295776367, + "learning_rate": 4.0476190476190474e-08, + "loss": 0.3265, + "step": 566 + }, + { + "epoch": 87.23076923076923, + "grad_norm": 12.494462966918945, + "learning_rate": 3.9285714285714285e-08, + "loss": 0.3226, + "step": 567 + }, + { + "epoch": 87.38461538461539, + "grad_norm": 6.6637773513793945, + "learning_rate": 3.8095238095238096e-08, + "loss": 0.2262, + "step": 568 + }, + { + "epoch": 87.53846153846153, + "grad_norm": 12.791160583496094, + "learning_rate": 3.69047619047619e-08, + "loss": 0.3541, + "step": 569 + }, + { + "epoch": 87.6923076923077, + "grad_norm": 9.732351303100586, + "learning_rate": 3.571428571428571e-08, + "loss": 0.2551, + "step": 570 + }, + { + "epoch": 87.84615384615384, + "grad_norm": 21.216760635375977, + "learning_rate": 3.452380952380952e-08, + "loss": 0.2629, + "step": 571 + }, + { + "epoch": 88.0, + "grad_norm": 15.293636322021484, + "learning_rate": 3.3333333333333334e-08, + "loss": 0.2665, + "step": 572 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.41707876324653625, + "eval_runtime": 1.1463, + "eval_samples_per_second": 238.151, + "eval_steps_per_second": 4.362, + "step": 572 + }, + { + "epoch": 88.15384615384616, + "grad_norm": 11.668240547180176, + "learning_rate": 3.214285714285714e-08, + "loss": 0.3315, + "step": 573 + }, + { + "epoch": 88.3076923076923, + "grad_norm": 28.14773178100586, + "learning_rate": 3.095238095238095e-08, + "loss": 0.3098, + "step": 574 + }, + { + "epoch": 88.46153846153847, + "grad_norm": 13.815803527832031, + "learning_rate": 2.9761904761904758e-08, + "loss": 0.3381, + "step": 575 + }, + { + "epoch": 88.61538461538461, + "grad_norm": 11.341737747192383, + "learning_rate": 2.857142857142857e-08, + "loss": 0.2995, + "step": 576 + }, + { + "epoch": 88.76923076923077, + "grad_norm": 9.191329002380371, + "learning_rate": 2.738095238095238e-08, + "loss": 0.2485, + "step": 577 + }, + { + "epoch": 88.92307692307692, + "grad_norm": 12.781599998474121, + "learning_rate": 2.619047619047619e-08, + "loss": 0.2654, + "step": 578 + }, + { + "epoch": 88.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4170469343662262, + "eval_runtime": 1.1144, + "eval_samples_per_second": 244.964, + "eval_steps_per_second": 4.487, + "step": 578 + }, + { + "epoch": 89.07692307692308, + "grad_norm": 18.813859939575195, + "learning_rate": 2.5e-08, + "loss": 0.4085, + "step": 579 + }, + { + "epoch": 89.23076923076923, + "grad_norm": 9.056866645812988, + "learning_rate": 2.3809523809523807e-08, + "loss": 0.2377, + "step": 580 + }, + { + "epoch": 89.38461538461539, + "grad_norm": 21.75194549560547, + "learning_rate": 2.2619047619047618e-08, + "loss": 0.2511, + "step": 581 + }, + { + "epoch": 89.53846153846153, + "grad_norm": 24.87982177734375, + "learning_rate": 2.142857142857143e-08, + "loss": 0.3468, + "step": 582 + }, + { + "epoch": 89.6923076923077, + "grad_norm": 13.90104866027832, + "learning_rate": 2.0238095238095237e-08, + "loss": 0.2001, + "step": 583 + }, + { + "epoch": 89.84615384615384, + "grad_norm": 13.514042854309082, + "learning_rate": 1.9047619047619048e-08, + "loss": 0.2673, + "step": 584 + }, + { + "epoch": 90.0, + "grad_norm": 17.871490478515625, + "learning_rate": 1.7857142857142856e-08, + "loss": 0.3059, + "step": 585 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4172230660915375, + "eval_runtime": 1.1156, + "eval_samples_per_second": 244.721, + "eval_steps_per_second": 4.482, + "step": 585 + }, + { + "epoch": 90.15384615384616, + "grad_norm": 9.703531265258789, + "learning_rate": 1.6666666666666667e-08, + "loss": 0.1731, + "step": 586 + }, + { + "epoch": 90.3076923076923, + "grad_norm": 13.011953353881836, + "learning_rate": 1.5476190476190475e-08, + "loss": 0.3008, + "step": 587 + }, + { + "epoch": 90.46153846153847, + "grad_norm": 13.331348419189453, + "learning_rate": 1.4285714285714284e-08, + "loss": 0.2402, + "step": 588 + }, + { + "epoch": 90.61538461538461, + "grad_norm": 9.919706344604492, + "learning_rate": 1.3095238095238096e-08, + "loss": 0.3376, + "step": 589 + }, + { + "epoch": 90.76923076923077, + "grad_norm": 20.184898376464844, + "learning_rate": 1.1904761904761903e-08, + "loss": 0.3106, + "step": 590 + }, + { + "epoch": 90.92307692307692, + "grad_norm": 6.9222588539123535, + "learning_rate": 1.0714285714285715e-08, + "loss": 0.2377, + "step": 591 + }, + { + "epoch": 90.92307692307692, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.417271226644516, + "eval_runtime": 1.1559, + "eval_samples_per_second": 236.178, + "eval_steps_per_second": 4.326, + "step": 591 + }, + { + "epoch": 91.07692307692308, + "grad_norm": 10.644214630126953, + "learning_rate": 9.523809523809524e-09, + "loss": 0.2738, + "step": 592 + }, + { + "epoch": 91.23076923076923, + "grad_norm": 12.530488014221191, + "learning_rate": 8.333333333333334e-09, + "loss": 0.3082, + "step": 593 + }, + { + "epoch": 91.38461538461539, + "grad_norm": 10.068582534790039, + "learning_rate": 7.142857142857142e-09, + "loss": 0.2678, + "step": 594 + }, + { + "epoch": 91.53846153846153, + "grad_norm": 8.5696439743042, + "learning_rate": 5.952380952380952e-09, + "loss": 0.2902, + "step": 595 + }, + { + "epoch": 91.6923076923077, + "grad_norm": 22.662599563598633, + "learning_rate": 4.761904761904762e-09, + "loss": 0.2826, + "step": 596 + }, + { + "epoch": 91.84615384615384, + "grad_norm": 19.666378021240234, + "learning_rate": 3.571428571428571e-09, + "loss": 0.3109, + "step": 597 + }, + { + "epoch": 92.0, + "grad_norm": 15.434282302856445, + "learning_rate": 2.380952380952381e-09, + "loss": 0.2896, + "step": 598 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4172247350215912, + "eval_runtime": 1.1283, + "eval_samples_per_second": 241.962, + "eval_steps_per_second": 4.432, + "step": 598 + }, + { + "epoch": 92.15384615384616, + "grad_norm": 21.7716064453125, + "learning_rate": 1.1904761904761905e-09, + "loss": 0.3775, + "step": 599 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 9.635214805603027, + "learning_rate": 0.0, + "loss": 0.3133, + "step": 600 + }, + { + "epoch": 92.3076923076923, + "eval_accuracy": 0.8717948717948718, + "eval_loss": 0.4172203838825226, + "eval_runtime": 1.1209, + "eval_samples_per_second": 243.565, + "eval_steps_per_second": 4.461, + "step": 600 + }, + { + "epoch": 92.3076923076923, + "step": 600, + "total_flos": 1.4722503891660472e+18, + "train_loss": 0.3897706772387028, + "train_runtime": 753.6703, + "train_samples_per_second": 108.137, + "train_steps_per_second": 0.796 + } + ], + "logging_steps": 1, + "max_steps": 600, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.4722503891660472e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}