{ "best_metric": 0.8791208791208791, "best_model_checkpoint": "vit-msn-small-lateral_flow_ivalidation_train_test_6/checkpoint-318", "epoch": 92.3076923076923, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15384615384615385, "grad_norm": 8.29874038696289, "learning_rate": 2.7777777777777776e-09, "loss": 0.6813, "step": 1 }, { "epoch": 0.3076923076923077, "grad_norm": 14.01459789276123, "learning_rate": 5.555555555555555e-09, "loss": 0.6845, "step": 2 }, { "epoch": 0.46153846153846156, "grad_norm": 12.911795616149902, "learning_rate": 8.333333333333334e-09, "loss": 0.6836, "step": 3 }, { "epoch": 0.6153846153846154, "grad_norm": 6.274200439453125, "learning_rate": 1.111111111111111e-08, "loss": 0.6521, "step": 4 }, { "epoch": 0.7692307692307693, "grad_norm": 8.031621932983398, "learning_rate": 1.3888888888888887e-08, "loss": 0.6781, "step": 5 }, { "epoch": 0.9230769230769231, "grad_norm": 7.054372787475586, "learning_rate": 1.6666666666666667e-08, "loss": 0.6672, "step": 6 }, { "epoch": 0.9230769230769231, "eval_accuracy": 0.42124542124542125, "eval_loss": 0.6979768872261047, "eval_runtime": 1.1041, "eval_samples_per_second": 247.254, "eval_steps_per_second": 4.528, "step": 6 }, { "epoch": 1.0769230769230769, "grad_norm": 7.790348529815674, "learning_rate": 1.9444444444444445e-08, "loss": 0.6725, "step": 7 }, { "epoch": 1.2307692307692308, "grad_norm": 13.414072036743164, "learning_rate": 2.222222222222222e-08, "loss": 0.6652, "step": 8 }, { "epoch": 1.3846153846153846, "grad_norm": 8.81810474395752, "learning_rate": 2.5e-08, "loss": 0.6698, "step": 9 }, { "epoch": 1.5384615384615383, "grad_norm": 7.751201152801514, "learning_rate": 2.7777777777777774e-08, "loss": 0.6817, "step": 10 }, { "epoch": 1.6923076923076923, "grad_norm": 8.985770225524902, "learning_rate": 3.0555555555555556e-08, "loss": 0.6661, "step": 11 }, { "epoch": 1.8461538461538463, "grad_norm": 6.888631820678711, "learning_rate": 3.3333333333333334e-08, "loss": 0.6712, "step": 12 }, { "epoch": 2.0, "grad_norm": 8.5791597366333, "learning_rate": 3.6111111111111106e-08, "loss": 0.6617, "step": 13 }, { "epoch": 2.0, "eval_accuracy": 0.4249084249084249, "eval_loss": 0.6965357065200806, "eval_runtime": 1.1521, "eval_samples_per_second": 236.95, "eval_steps_per_second": 4.34, "step": 13 }, { "epoch": 2.1538461538461537, "grad_norm": 9.721835136413574, "learning_rate": 3.888888888888889e-08, "loss": 0.665, "step": 14 }, { "epoch": 2.3076923076923075, "grad_norm": 10.525633811950684, "learning_rate": 4.166666666666666e-08, "loss": 0.6629, "step": 15 }, { "epoch": 2.4615384615384617, "grad_norm": 5.860161304473877, "learning_rate": 4.444444444444444e-08, "loss": 0.686, "step": 16 }, { "epoch": 2.6153846153846154, "grad_norm": 7.980571269989014, "learning_rate": 4.722222222222222e-08, "loss": 0.6638, "step": 17 }, { "epoch": 2.769230769230769, "grad_norm": 6.91188383102417, "learning_rate": 5e-08, "loss": 0.6586, "step": 18 }, { "epoch": 2.9230769230769234, "grad_norm": 8.952598571777344, "learning_rate": 5.2777777777777776e-08, "loss": 0.6699, "step": 19 }, { "epoch": 2.9230769230769234, "eval_accuracy": 0.43956043956043955, "eval_loss": 0.6944313049316406, "eval_runtime": 1.1568, "eval_samples_per_second": 235.989, "eval_steps_per_second": 4.322, "step": 19 }, { "epoch": 3.076923076923077, "grad_norm": 16.572885513305664, "learning_rate": 5.555555555555555e-08, "loss": 0.6761, "step": 20 }, { "epoch": 3.230769230769231, "grad_norm": 10.379450798034668, "learning_rate": 5.833333333333333e-08, "loss": 0.6757, "step": 21 }, { "epoch": 3.3846153846153846, "grad_norm": 8.093114852905273, "learning_rate": 6.111111111111111e-08, "loss": 0.6572, "step": 22 }, { "epoch": 3.5384615384615383, "grad_norm": 12.009868621826172, "learning_rate": 6.388888888888888e-08, "loss": 0.6781, "step": 23 }, { "epoch": 3.6923076923076925, "grad_norm": 6.616479396820068, "learning_rate": 6.666666666666667e-08, "loss": 0.6519, "step": 24 }, { "epoch": 3.8461538461538463, "grad_norm": 4.752511024475098, "learning_rate": 6.944444444444444e-08, "loss": 0.669, "step": 25 }, { "epoch": 4.0, "grad_norm": 6.52732515335083, "learning_rate": 7.222222222222221e-08, "loss": 0.662, "step": 26 }, { "epoch": 4.0, "eval_accuracy": 0.43956043956043955, "eval_loss": 0.6909541487693787, "eval_runtime": 1.1432, "eval_samples_per_second": 238.799, "eval_steps_per_second": 4.374, "step": 26 }, { "epoch": 4.153846153846154, "grad_norm": 10.048100471496582, "learning_rate": 7.5e-08, "loss": 0.6616, "step": 27 }, { "epoch": 4.3076923076923075, "grad_norm": 9.803552627563477, "learning_rate": 7.777777777777778e-08, "loss": 0.672, "step": 28 }, { "epoch": 4.461538461538462, "grad_norm": 4.989064693450928, "learning_rate": 8.055555555555555e-08, "loss": 0.6615, "step": 29 }, { "epoch": 4.615384615384615, "grad_norm": 9.584320068359375, "learning_rate": 8.333333333333333e-08, "loss": 0.6534, "step": 30 }, { "epoch": 4.769230769230769, "grad_norm": 8.697301864624023, "learning_rate": 8.611111111111111e-08, "loss": 0.6592, "step": 31 }, { "epoch": 4.923076923076923, "grad_norm": 6.305507183074951, "learning_rate": 8.888888888888888e-08, "loss": 0.6548, "step": 32 }, { "epoch": 4.923076923076923, "eval_accuracy": 0.45787545787545786, "eval_loss": 0.6873242855072021, "eval_runtime": 1.0879, "eval_samples_per_second": 250.944, "eval_steps_per_second": 4.596, "step": 32 }, { "epoch": 5.076923076923077, "grad_norm": 8.161693572998047, "learning_rate": 9.166666666666665e-08, "loss": 0.6463, "step": 33 }, { "epoch": 5.230769230769231, "grad_norm": 5.462063789367676, "learning_rate": 9.444444444444444e-08, "loss": 0.6532, "step": 34 }, { "epoch": 5.384615384615385, "grad_norm": 7.397779941558838, "learning_rate": 9.722222222222222e-08, "loss": 0.6447, "step": 35 }, { "epoch": 5.538461538461538, "grad_norm": 9.939643859863281, "learning_rate": 1e-07, "loss": 0.6484, "step": 36 }, { "epoch": 5.6923076923076925, "grad_norm": 6.816234111785889, "learning_rate": 1.0277777777777777e-07, "loss": 0.6558, "step": 37 }, { "epoch": 5.846153846153846, "grad_norm": 11.238946914672852, "learning_rate": 1.0555555555555555e-07, "loss": 0.6669, "step": 38 }, { "epoch": 6.0, "grad_norm": 8.005824089050293, "learning_rate": 1.0833333333333334e-07, "loss": 0.6541, "step": 39 }, { "epoch": 6.0, "eval_accuracy": 0.4835164835164835, "eval_loss": 0.6825475692749023, "eval_runtime": 1.1582, "eval_samples_per_second": 235.72, "eval_steps_per_second": 4.317, "step": 39 }, { "epoch": 6.153846153846154, "grad_norm": 6.5383524894714355, "learning_rate": 1.111111111111111e-07, "loss": 0.6317, "step": 40 }, { "epoch": 6.3076923076923075, "grad_norm": 4.98549747467041, "learning_rate": 1.1388888888888888e-07, "loss": 0.6609, "step": 41 }, { "epoch": 6.461538461538462, "grad_norm": 6.8932037353515625, "learning_rate": 1.1666666666666667e-07, "loss": 0.6459, "step": 42 }, { "epoch": 6.615384615384615, "grad_norm": 6.804798126220703, "learning_rate": 1.1944444444444445e-07, "loss": 0.6628, "step": 43 }, { "epoch": 6.769230769230769, "grad_norm": 7.5995707511901855, "learning_rate": 1.2222222222222222e-07, "loss": 0.6501, "step": 44 }, { "epoch": 6.923076923076923, "grad_norm": 10.426745414733887, "learning_rate": 1.25e-07, "loss": 0.6222, "step": 45 }, { "epoch": 6.923076923076923, "eval_accuracy": 0.5311355311355311, "eval_loss": 0.6776841282844543, "eval_runtime": 1.1748, "eval_samples_per_second": 232.383, "eval_steps_per_second": 4.256, "step": 45 }, { "epoch": 7.076923076923077, "grad_norm": 6.766184329986572, "learning_rate": 1.2777777777777777e-07, "loss": 0.6452, "step": 46 }, { "epoch": 7.230769230769231, "grad_norm": 10.556381225585938, "learning_rate": 1.3055555555555556e-07, "loss": 0.6157, "step": 47 }, { "epoch": 7.384615384615385, "grad_norm": 9.157587051391602, "learning_rate": 1.3333333333333334e-07, "loss": 0.6313, "step": 48 }, { "epoch": 7.538461538461538, "grad_norm": 6.186281204223633, "learning_rate": 1.3611111111111108e-07, "loss": 0.6773, "step": 49 }, { "epoch": 7.6923076923076925, "grad_norm": 5.614987373352051, "learning_rate": 1.3888888888888888e-07, "loss": 0.6474, "step": 50 }, { "epoch": 7.846153846153846, "grad_norm": 5.66176700592041, "learning_rate": 1.4166666666666665e-07, "loss": 0.6463, "step": 51 }, { "epoch": 8.0, "grad_norm": 4.733225345611572, "learning_rate": 1.4444444444444442e-07, "loss": 0.6555, "step": 52 }, { "epoch": 8.0, "eval_accuracy": 0.5421245421245421, "eval_loss": 0.6718742251396179, "eval_runtime": 1.1693, "eval_samples_per_second": 233.475, "eval_steps_per_second": 4.276, "step": 52 }, { "epoch": 8.153846153846153, "grad_norm": 5.067059516906738, "learning_rate": 1.4722222222222222e-07, "loss": 0.6388, "step": 53 }, { "epoch": 8.307692307692308, "grad_norm": 7.690587520599365, "learning_rate": 1.5e-07, "loss": 0.6139, "step": 54 }, { "epoch": 8.461538461538462, "grad_norm": 4.471611976623535, "learning_rate": 1.527777777777778e-07, "loss": 0.64, "step": 55 }, { "epoch": 8.615384615384615, "grad_norm": 6.964099884033203, "learning_rate": 1.5555555555555556e-07, "loss": 0.6399, "step": 56 }, { "epoch": 8.76923076923077, "grad_norm": 4.763670444488525, "learning_rate": 1.583333333333333e-07, "loss": 0.6176, "step": 57 }, { "epoch": 8.923076923076923, "grad_norm": 7.8895063400268555, "learning_rate": 1.611111111111111e-07, "loss": 0.6226, "step": 58 }, { "epoch": 8.923076923076923, "eval_accuracy": 0.5860805860805861, "eval_loss": 0.6665313839912415, "eval_runtime": 1.1415, "eval_samples_per_second": 239.164, "eval_steps_per_second": 4.38, "step": 58 }, { "epoch": 9.076923076923077, "grad_norm": 5.872027397155762, "learning_rate": 1.6388888888888888e-07, "loss": 0.6097, "step": 59 }, { "epoch": 9.23076923076923, "grad_norm": 4.506241798400879, "learning_rate": 1.6666666666666665e-07, "loss": 0.6481, "step": 60 }, { "epoch": 9.384615384615385, "grad_norm": 6.888943672180176, "learning_rate": 1.6944444444444445e-07, "loss": 0.6135, "step": 61 }, { "epoch": 9.538461538461538, "grad_norm": 5.079667568206787, "learning_rate": 1.7222222222222222e-07, "loss": 0.6268, "step": 62 }, { "epoch": 9.692307692307692, "grad_norm": 5.653709411621094, "learning_rate": 1.75e-07, "loss": 0.6259, "step": 63 }, { "epoch": 9.846153846153847, "grad_norm": 5.669986724853516, "learning_rate": 1.7777777777777776e-07, "loss": 0.6195, "step": 64 }, { "epoch": 10.0, "grad_norm": 5.195782661437988, "learning_rate": 1.8055555555555554e-07, "loss": 0.5989, "step": 65 }, { "epoch": 10.0, "eval_accuracy": 0.6153846153846154, "eval_loss": 0.6603101491928101, "eval_runtime": 1.1943, "eval_samples_per_second": 228.592, "eval_steps_per_second": 4.187, "step": 65 }, { "epoch": 10.153846153846153, "grad_norm": 5.164191722869873, "learning_rate": 1.833333333333333e-07, "loss": 0.6329, "step": 66 }, { "epoch": 10.307692307692308, "grad_norm": 5.711167812347412, "learning_rate": 1.861111111111111e-07, "loss": 0.6198, "step": 67 }, { "epoch": 10.461538461538462, "grad_norm": 5.415841102600098, "learning_rate": 1.8888888888888888e-07, "loss": 0.6162, "step": 68 }, { "epoch": 10.615384615384615, "grad_norm": 5.584977626800537, "learning_rate": 1.9166666666666668e-07, "loss": 0.626, "step": 69 }, { "epoch": 10.76923076923077, "grad_norm": 4.149146556854248, "learning_rate": 1.9444444444444445e-07, "loss": 0.6246, "step": 70 }, { "epoch": 10.923076923076923, "grad_norm": 8.436751365661621, "learning_rate": 1.9722222222222222e-07, "loss": 0.5754, "step": 71 }, { "epoch": 10.923076923076923, "eval_accuracy": 0.6263736263736264, "eval_loss": 0.6555379629135132, "eval_runtime": 1.1482, "eval_samples_per_second": 237.759, "eval_steps_per_second": 4.355, "step": 71 }, { "epoch": 11.076923076923077, "grad_norm": 5.09835147857666, "learning_rate": 2e-07, "loss": 0.6153, "step": 72 }, { "epoch": 11.23076923076923, "grad_norm": 6.399817943572998, "learning_rate": 2.0277777777777776e-07, "loss": 0.6001, "step": 73 }, { "epoch": 11.384615384615385, "grad_norm": 4.717789173126221, "learning_rate": 2.0555555555555553e-07, "loss": 0.6009, "step": 74 }, { "epoch": 11.538461538461538, "grad_norm": 5.399771213531494, "learning_rate": 2.0833333333333333e-07, "loss": 0.5998, "step": 75 }, { "epoch": 11.692307692307692, "grad_norm": 5.429381370544434, "learning_rate": 2.111111111111111e-07, "loss": 0.5892, "step": 76 }, { "epoch": 11.846153846153847, "grad_norm": 4.105190277099609, "learning_rate": 2.1388888888888888e-07, "loss": 0.6111, "step": 77 }, { "epoch": 12.0, "grad_norm": 6.455173969268799, "learning_rate": 2.1666666666666667e-07, "loss": 0.6251, "step": 78 }, { "epoch": 12.0, "eval_accuracy": 0.6483516483516484, "eval_loss": 0.6492875218391418, "eval_runtime": 1.1131, "eval_samples_per_second": 245.251, "eval_steps_per_second": 4.492, "step": 78 }, { "epoch": 12.153846153846153, "grad_norm": 6.023905277252197, "learning_rate": 2.1944444444444442e-07, "loss": 0.6211, "step": 79 }, { "epoch": 12.307692307692308, "grad_norm": 4.803109645843506, "learning_rate": 2.222222222222222e-07, "loss": 0.5972, "step": 80 }, { "epoch": 12.461538461538462, "grad_norm": 4.598735332489014, "learning_rate": 2.25e-07, "loss": 0.5978, "step": 81 }, { "epoch": 12.615384615384615, "grad_norm": 5.137476444244385, "learning_rate": 2.2777777777777776e-07, "loss": 0.5878, "step": 82 }, { "epoch": 12.76923076923077, "grad_norm": 5.255553245544434, "learning_rate": 2.3055555555555556e-07, "loss": 0.59, "step": 83 }, { "epoch": 12.923076923076923, "grad_norm": 4.83677864074707, "learning_rate": 2.3333333333333333e-07, "loss": 0.5796, "step": 84 }, { "epoch": 12.923076923076923, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6446050405502319, "eval_runtime": 1.1065, "eval_samples_per_second": 246.726, "eval_steps_per_second": 4.519, "step": 84 }, { "epoch": 13.076923076923077, "grad_norm": 5.2046403884887695, "learning_rate": 2.361111111111111e-07, "loss": 0.5808, "step": 85 }, { "epoch": 13.23076923076923, "grad_norm": 5.6977057456970215, "learning_rate": 2.388888888888889e-07, "loss": 0.5921, "step": 86 }, { "epoch": 13.384615384615385, "grad_norm": 4.75449800491333, "learning_rate": 2.4166666666666665e-07, "loss": 0.576, "step": 87 }, { "epoch": 13.538461538461538, "grad_norm": 4.761056423187256, "learning_rate": 2.4444444444444445e-07, "loss": 0.6126, "step": 88 }, { "epoch": 13.692307692307692, "grad_norm": 4.913057327270508, "learning_rate": 2.4722222222222224e-07, "loss": 0.5814, "step": 89 }, { "epoch": 13.846153846153847, "grad_norm": 7.290613651275635, "learning_rate": 2.5e-07, "loss": 0.5803, "step": 90 }, { "epoch": 14.0, "grad_norm": 6.033799171447754, "learning_rate": 2.5277777777777773e-07, "loss": 0.5763, "step": 91 }, { "epoch": 14.0, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6390318274497986, "eval_runtime": 1.1176, "eval_samples_per_second": 244.273, "eval_steps_per_second": 4.474, "step": 91 }, { "epoch": 14.153846153846153, "grad_norm": 5.3400678634643555, "learning_rate": 2.5555555555555553e-07, "loss": 0.5817, "step": 92 }, { "epoch": 14.307692307692308, "grad_norm": 3.9350779056549072, "learning_rate": 2.5833333333333333e-07, "loss": 0.588, "step": 93 }, { "epoch": 14.461538461538462, "grad_norm": 4.339548110961914, "learning_rate": 2.6111111111111113e-07, "loss": 0.5964, "step": 94 }, { "epoch": 14.615384615384615, "grad_norm": 5.889684677124023, "learning_rate": 2.638888888888889e-07, "loss": 0.5636, "step": 95 }, { "epoch": 14.76923076923077, "grad_norm": 4.385285377502441, "learning_rate": 2.6666666666666667e-07, "loss": 0.5898, "step": 96 }, { "epoch": 14.923076923076923, "grad_norm": 7.42651891708374, "learning_rate": 2.694444444444444e-07, "loss": 0.5952, "step": 97 }, { "epoch": 14.923076923076923, "eval_accuracy": 0.684981684981685, "eval_loss": 0.6333425045013428, "eval_runtime": 1.0931, "eval_samples_per_second": 249.747, "eval_steps_per_second": 4.574, "step": 97 }, { "epoch": 15.076923076923077, "grad_norm": 7.099740028381348, "learning_rate": 2.7222222222222216e-07, "loss": 0.5412, "step": 98 }, { "epoch": 15.23076923076923, "grad_norm": 5.428436279296875, "learning_rate": 2.75e-07, "loss": 0.5645, "step": 99 }, { "epoch": 15.384615384615385, "grad_norm": 5.187060832977295, "learning_rate": 2.7777777777777776e-07, "loss": 0.5795, "step": 100 }, { "epoch": 15.538461538461538, "grad_norm": 4.902520656585693, "learning_rate": 2.8055555555555556e-07, "loss": 0.5459, "step": 101 }, { "epoch": 15.692307692307692, "grad_norm": 4.782113075256348, "learning_rate": 2.833333333333333e-07, "loss": 0.577, "step": 102 }, { "epoch": 15.846153846153847, "grad_norm": 6.622628688812256, "learning_rate": 2.861111111111111e-07, "loss": 0.5583, "step": 103 }, { "epoch": 16.0, "grad_norm": 4.653256416320801, "learning_rate": 2.8888888888888885e-07, "loss": 0.5675, "step": 104 }, { "epoch": 16.0, "eval_accuracy": 0.7032967032967034, "eval_loss": 0.6269450783729553, "eval_runtime": 1.2331, "eval_samples_per_second": 221.389, "eval_steps_per_second": 4.055, "step": 104 }, { "epoch": 16.153846153846153, "grad_norm": 5.335786819458008, "learning_rate": 2.916666666666667e-07, "loss": 0.5492, "step": 105 }, { "epoch": 16.307692307692307, "grad_norm": 6.031639575958252, "learning_rate": 2.9444444444444444e-07, "loss": 0.5475, "step": 106 }, { "epoch": 16.46153846153846, "grad_norm": 8.427881240844727, "learning_rate": 2.972222222222222e-07, "loss": 0.5995, "step": 107 }, { "epoch": 16.615384615384617, "grad_norm": 4.41181755065918, "learning_rate": 3e-07, "loss": 0.5458, "step": 108 }, { "epoch": 16.76923076923077, "grad_norm": 5.0706257820129395, "learning_rate": 3.0277777777777773e-07, "loss": 0.5739, "step": 109 }, { "epoch": 16.923076923076923, "grad_norm": 5.834512233734131, "learning_rate": 3.055555555555556e-07, "loss": 0.5453, "step": 110 }, { "epoch": 16.923076923076923, "eval_accuracy": 0.7106227106227107, "eval_loss": 0.6210848689079285, "eval_runtime": 1.1361, "eval_samples_per_second": 240.302, "eval_steps_per_second": 4.401, "step": 110 }, { "epoch": 17.076923076923077, "grad_norm": 4.99503231048584, "learning_rate": 3.0833333333333333e-07, "loss": 0.5517, "step": 111 }, { "epoch": 17.23076923076923, "grad_norm": 4.0909833908081055, "learning_rate": 3.111111111111111e-07, "loss": 0.5664, "step": 112 }, { "epoch": 17.384615384615383, "grad_norm": 4.5490851402282715, "learning_rate": 3.1388888888888887e-07, "loss": 0.541, "step": 113 }, { "epoch": 17.53846153846154, "grad_norm": 4.016046524047852, "learning_rate": 3.166666666666666e-07, "loss": 0.5788, "step": 114 }, { "epoch": 17.692307692307693, "grad_norm": 5.587233066558838, "learning_rate": 3.194444444444444e-07, "loss": 0.5155, "step": 115 }, { "epoch": 17.846153846153847, "grad_norm": 5.020046234130859, "learning_rate": 3.222222222222222e-07, "loss": 0.5386, "step": 116 }, { "epoch": 18.0, "grad_norm": 6.417337417602539, "learning_rate": 3.25e-07, "loss": 0.5199, "step": 117 }, { "epoch": 18.0, "eval_accuracy": 0.7142857142857143, "eval_loss": 0.6150110363960266, "eval_runtime": 1.1307, "eval_samples_per_second": 241.446, "eval_steps_per_second": 4.422, "step": 117 }, { "epoch": 18.153846153846153, "grad_norm": 5.794677257537842, "learning_rate": 3.2777777777777776e-07, "loss": 0.5343, "step": 118 }, { "epoch": 18.307692307692307, "grad_norm": 4.628763198852539, "learning_rate": 3.3055555555555556e-07, "loss": 0.5275, "step": 119 }, { "epoch": 18.46153846153846, "grad_norm": 7.958808422088623, "learning_rate": 3.333333333333333e-07, "loss": 0.5252, "step": 120 }, { "epoch": 18.615384615384617, "grad_norm": 4.4327826499938965, "learning_rate": 3.361111111111111e-07, "loss": 0.5233, "step": 121 }, { "epoch": 18.76923076923077, "grad_norm": 5.422006130218506, "learning_rate": 3.388888888888889e-07, "loss": 0.5646, "step": 122 }, { "epoch": 18.923076923076923, "grad_norm": 4.5244975090026855, "learning_rate": 3.4166666666666664e-07, "loss": 0.541, "step": 123 }, { "epoch": 18.923076923076923, "eval_accuracy": 0.7216117216117216, "eval_loss": 0.6090343594551086, "eval_runtime": 1.2237, "eval_samples_per_second": 223.1, "eval_steps_per_second": 4.086, "step": 123 }, { "epoch": 19.076923076923077, "grad_norm": 6.068373680114746, "learning_rate": 3.4444444444444444e-07, "loss": 0.5179, "step": 124 }, { "epoch": 19.23076923076923, "grad_norm": 4.209527492523193, "learning_rate": 3.472222222222222e-07, "loss": 0.5533, "step": 125 }, { "epoch": 19.384615384615383, "grad_norm": 5.68998384475708, "learning_rate": 3.5e-07, "loss": 0.5219, "step": 126 }, { "epoch": 19.53846153846154, "grad_norm": 5.829248428344727, "learning_rate": 3.527777777777778e-07, "loss": 0.503, "step": 127 }, { "epoch": 19.692307692307693, "grad_norm": 4.40165376663208, "learning_rate": 3.5555555555555553e-07, "loss": 0.5483, "step": 128 }, { "epoch": 19.846153846153847, "grad_norm": 4.628623962402344, "learning_rate": 3.583333333333333e-07, "loss": 0.5249, "step": 129 }, { "epoch": 20.0, "grad_norm": 4.771780014038086, "learning_rate": 3.6111111111111107e-07, "loss": 0.5273, "step": 130 }, { "epoch": 20.0, "eval_accuracy": 0.7289377289377289, "eval_loss": 0.6007034182548523, "eval_runtime": 1.1281, "eval_samples_per_second": 241.989, "eval_steps_per_second": 4.432, "step": 130 }, { "epoch": 20.153846153846153, "grad_norm": 4.505945682525635, "learning_rate": 3.6388888888888887e-07, "loss": 0.515, "step": 131 }, { "epoch": 20.307692307692307, "grad_norm": 7.482431411743164, "learning_rate": 3.666666666666666e-07, "loss": 0.4948, "step": 132 }, { "epoch": 20.46153846153846, "grad_norm": 4.583889007568359, "learning_rate": 3.6944444444444447e-07, "loss": 0.5215, "step": 133 }, { "epoch": 20.615384615384617, "grad_norm": 5.050055980682373, "learning_rate": 3.722222222222222e-07, "loss": 0.502, "step": 134 }, { "epoch": 20.76923076923077, "grad_norm": 5.0981245040893555, "learning_rate": 3.75e-07, "loss": 0.5343, "step": 135 }, { "epoch": 20.923076923076923, "grad_norm": 7.6524529457092285, "learning_rate": 3.7777777777777775e-07, "loss": 0.495, "step": 136 }, { "epoch": 20.923076923076923, "eval_accuracy": 0.7289377289377289, "eval_loss": 0.5934087634086609, "eval_runtime": 1.1036, "eval_samples_per_second": 247.368, "eval_steps_per_second": 4.531, "step": 136 }, { "epoch": 21.076923076923077, "grad_norm": 5.747613906860352, "learning_rate": 3.805555555555555e-07, "loss": 0.4929, "step": 137 }, { "epoch": 21.23076923076923, "grad_norm": 8.28075122833252, "learning_rate": 3.8333333333333335e-07, "loss": 0.5302, "step": 138 }, { "epoch": 21.384615384615383, "grad_norm": 5.376802921295166, "learning_rate": 3.861111111111111e-07, "loss": 0.547, "step": 139 }, { "epoch": 21.53846153846154, "grad_norm": 4.9347968101501465, "learning_rate": 3.888888888888889e-07, "loss": 0.4807, "step": 140 }, { "epoch": 21.692307692307693, "grad_norm": 6.574237823486328, "learning_rate": 3.9166666666666664e-07, "loss": 0.4839, "step": 141 }, { "epoch": 21.846153846153847, "grad_norm": 4.990074157714844, "learning_rate": 3.9444444444444444e-07, "loss": 0.4937, "step": 142 }, { "epoch": 22.0, "grad_norm": 6.261688232421875, "learning_rate": 3.972222222222222e-07, "loss": 0.4855, "step": 143 }, { "epoch": 22.0, "eval_accuracy": 0.7472527472527473, "eval_loss": 0.5855301022529602, "eval_runtime": 1.1327, "eval_samples_per_second": 241.009, "eval_steps_per_second": 4.414, "step": 143 }, { "epoch": 22.153846153846153, "grad_norm": 5.447592258453369, "learning_rate": 4e-07, "loss": 0.4225, "step": 144 }, { "epoch": 22.307692307692307, "grad_norm": 4.675624847412109, "learning_rate": 4.027777777777778e-07, "loss": 0.4982, "step": 145 }, { "epoch": 22.46153846153846, "grad_norm": 4.06153678894043, "learning_rate": 4.055555555555555e-07, "loss": 0.4751, "step": 146 }, { "epoch": 22.615384615384617, "grad_norm": 7.109158039093018, "learning_rate": 4.083333333333333e-07, "loss": 0.4232, "step": 147 }, { "epoch": 22.76923076923077, "grad_norm": 7.506172180175781, "learning_rate": 4.1111111111111107e-07, "loss": 0.4935, "step": 148 }, { "epoch": 22.923076923076923, "grad_norm": 4.346311092376709, "learning_rate": 4.1388888888888887e-07, "loss": 0.4763, "step": 149 }, { "epoch": 22.923076923076923, "eval_accuracy": 0.7362637362637363, "eval_loss": 0.5787296295166016, "eval_runtime": 1.163, "eval_samples_per_second": 234.74, "eval_steps_per_second": 4.299, "step": 149 }, { "epoch": 23.076923076923077, "grad_norm": 7.535876750946045, "learning_rate": 4.1666666666666667e-07, "loss": 0.4539, "step": 150 }, { "epoch": 23.23076923076923, "grad_norm": 4.603224277496338, "learning_rate": 4.194444444444444e-07, "loss": 0.491, "step": 151 }, { "epoch": 23.384615384615383, "grad_norm": 4.099734783172607, "learning_rate": 4.222222222222222e-07, "loss": 0.4877, "step": 152 }, { "epoch": 23.53846153846154, "grad_norm": 6.270420551300049, "learning_rate": 4.2499999999999995e-07, "loss": 0.4791, "step": 153 }, { "epoch": 23.692307692307693, "grad_norm": 5.5756330490112305, "learning_rate": 4.2777777777777775e-07, "loss": 0.4192, "step": 154 }, { "epoch": 23.846153846153847, "grad_norm": 4.492015838623047, "learning_rate": 4.3055555555555555e-07, "loss": 0.429, "step": 155 }, { "epoch": 24.0, "grad_norm": 5.119944095611572, "learning_rate": 4.3333333333333335e-07, "loss": 0.4287, "step": 156 }, { "epoch": 24.0, "eval_accuracy": 0.7509157509157509, "eval_loss": 0.5693350434303284, "eval_runtime": 1.2614, "eval_samples_per_second": 216.434, "eval_steps_per_second": 3.964, "step": 156 }, { "epoch": 24.153846153846153, "grad_norm": 6.68319034576416, "learning_rate": 4.361111111111111e-07, "loss": 0.4549, "step": 157 }, { "epoch": 24.307692307692307, "grad_norm": 6.93166446685791, "learning_rate": 4.3888888888888884e-07, "loss": 0.4878, "step": 158 }, { "epoch": 24.46153846153846, "grad_norm": 4.761252403259277, "learning_rate": 4.4166666666666664e-07, "loss": 0.463, "step": 159 }, { "epoch": 24.615384615384617, "grad_norm": 5.771098613739014, "learning_rate": 4.444444444444444e-07, "loss": 0.4418, "step": 160 }, { "epoch": 24.76923076923077, "grad_norm": 5.692720413208008, "learning_rate": 4.4722222222222223e-07, "loss": 0.4211, "step": 161 }, { "epoch": 24.923076923076923, "grad_norm": 10.276296615600586, "learning_rate": 4.5e-07, "loss": 0.445, "step": 162 }, { "epoch": 24.923076923076923, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.5618556141853333, "eval_runtime": 1.2168, "eval_samples_per_second": 224.355, "eval_steps_per_second": 4.109, "step": 162 }, { "epoch": 25.076923076923077, "grad_norm": 6.790480613708496, "learning_rate": 4.527777777777778e-07, "loss": 0.4313, "step": 163 }, { "epoch": 25.23076923076923, "grad_norm": 4.737351417541504, "learning_rate": 4.555555555555555e-07, "loss": 0.4303, "step": 164 }, { "epoch": 25.384615384615383, "grad_norm": 4.519160747528076, "learning_rate": 4.5833333333333327e-07, "loss": 0.455, "step": 165 }, { "epoch": 25.53846153846154, "grad_norm": 5.933927536010742, "learning_rate": 4.611111111111111e-07, "loss": 0.4461, "step": 166 }, { "epoch": 25.692307692307693, "grad_norm": 7.9362664222717285, "learning_rate": 4.6388888888888886e-07, "loss": 0.3951, "step": 167 }, { "epoch": 25.846153846153847, "grad_norm": 9.995780944824219, "learning_rate": 4.6666666666666666e-07, "loss": 0.4142, "step": 168 }, { "epoch": 26.0, "grad_norm": 6.652438163757324, "learning_rate": 4.694444444444444e-07, "loss": 0.4343, "step": 169 }, { "epoch": 26.0, "eval_accuracy": 0.7802197802197802, "eval_loss": 0.5539770126342773, "eval_runtime": 1.1366, "eval_samples_per_second": 240.189, "eval_steps_per_second": 4.399, "step": 169 }, { "epoch": 26.153846153846153, "grad_norm": 4.573406219482422, "learning_rate": 4.722222222222222e-07, "loss": 0.4279, "step": 170 }, { "epoch": 26.307692307692307, "grad_norm": 6.939181327819824, "learning_rate": 4.7499999999999995e-07, "loss": 0.443, "step": 171 }, { "epoch": 26.46153846153846, "grad_norm": 10.387896537780762, "learning_rate": 4.777777777777778e-07, "loss": 0.4462, "step": 172 }, { "epoch": 26.615384615384617, "grad_norm": 6.387257099151611, "learning_rate": 4.805555555555555e-07, "loss": 0.4221, "step": 173 }, { "epoch": 26.76923076923077, "grad_norm": 5.768893241882324, "learning_rate": 4.833333333333333e-07, "loss": 0.4048, "step": 174 }, { "epoch": 26.923076923076923, "grad_norm": 6.23684024810791, "learning_rate": 4.861111111111111e-07, "loss": 0.3748, "step": 175 }, { "epoch": 26.923076923076923, "eval_accuracy": 0.7875457875457875, "eval_loss": 0.5466815829277039, "eval_runtime": 1.1181, "eval_samples_per_second": 244.171, "eval_steps_per_second": 4.472, "step": 175 }, { "epoch": 27.076923076923077, "grad_norm": 6.4479146003723145, "learning_rate": 4.888888888888889e-07, "loss": 0.3923, "step": 176 }, { "epoch": 27.23076923076923, "grad_norm": 8.554872512817383, "learning_rate": 4.916666666666666e-07, "loss": 0.4114, "step": 177 }, { "epoch": 27.384615384615383, "grad_norm": 6.017930030822754, "learning_rate": 4.944444444444445e-07, "loss": 0.3798, "step": 178 }, { "epoch": 27.53846153846154, "grad_norm": 8.284281730651855, "learning_rate": 4.972222222222222e-07, "loss": 0.4146, "step": 179 }, { "epoch": 27.692307692307693, "grad_norm": 9.119588851928711, "learning_rate": 5e-07, "loss": 0.4436, "step": 180 }, { "epoch": 27.846153846153847, "grad_norm": 9.632242202758789, "learning_rate": 4.988095238095238e-07, "loss": 0.4031, "step": 181 }, { "epoch": 28.0, "grad_norm": 16.935251235961914, "learning_rate": 4.976190476190476e-07, "loss": 0.4041, "step": 182 }, { "epoch": 28.0, "eval_accuracy": 0.8021978021978022, "eval_loss": 0.5421282052993774, "eval_runtime": 1.1482, "eval_samples_per_second": 237.764, "eval_steps_per_second": 4.355, "step": 182 }, { "epoch": 28.153846153846153, "grad_norm": 5.241950511932373, "learning_rate": 4.964285714285715e-07, "loss": 0.4528, "step": 183 }, { "epoch": 28.307692307692307, "grad_norm": 11.666001319885254, "learning_rate": 4.952380952380952e-07, "loss": 0.3889, "step": 184 }, { "epoch": 28.46153846153846, "grad_norm": 9.493650436401367, "learning_rate": 4.94047619047619e-07, "loss": 0.3956, "step": 185 }, { "epoch": 28.615384615384617, "grad_norm": 11.480545043945312, "learning_rate": 4.928571428571429e-07, "loss": 0.3756, "step": 186 }, { "epoch": 28.76923076923077, "grad_norm": 18.468042373657227, "learning_rate": 4.916666666666666e-07, "loss": 0.401, "step": 187 }, { "epoch": 28.923076923076923, "grad_norm": 8.127854347229004, "learning_rate": 4.904761904761904e-07, "loss": 0.3543, "step": 188 }, { "epoch": 28.923076923076923, "eval_accuracy": 0.8205128205128205, "eval_loss": 0.529083788394928, "eval_runtime": 1.1719, "eval_samples_per_second": 232.959, "eval_steps_per_second": 4.267, "step": 188 }, { "epoch": 29.076923076923077, "grad_norm": 12.729475975036621, "learning_rate": 4.892857142857142e-07, "loss": 0.4255, "step": 189 }, { "epoch": 29.23076923076923, "grad_norm": 6.080268383026123, "learning_rate": 4.880952380952381e-07, "loss": 0.4233, "step": 190 }, { "epoch": 29.384615384615383, "grad_norm": 9.651863098144531, "learning_rate": 4.869047619047619e-07, "loss": 0.3753, "step": 191 }, { "epoch": 29.53846153846154, "grad_norm": 12.342060089111328, "learning_rate": 4.857142857142857e-07, "loss": 0.392, "step": 192 }, { "epoch": 29.692307692307693, "grad_norm": 5.0623779296875, "learning_rate": 4.845238095238095e-07, "loss": 0.3942, "step": 193 }, { "epoch": 29.846153846153847, "grad_norm": 18.967161178588867, "learning_rate": 4.833333333333333e-07, "loss": 0.3374, "step": 194 }, { "epoch": 30.0, "grad_norm": 13.24194622039795, "learning_rate": 4.821428571428571e-07, "loss": 0.3972, "step": 195 }, { "epoch": 30.0, "eval_accuracy": 0.8278388278388278, "eval_loss": 0.5134266018867493, "eval_runtime": 1.2397, "eval_samples_per_second": 220.215, "eval_steps_per_second": 4.033, "step": 195 }, { "epoch": 30.153846153846153, "grad_norm": 12.518712997436523, "learning_rate": 4.80952380952381e-07, "loss": 0.4197, "step": 196 }, { "epoch": 30.307692307692307, "grad_norm": 15.362881660461426, "learning_rate": 4.797619047619048e-07, "loss": 0.4056, "step": 197 }, { "epoch": 30.46153846153846, "grad_norm": 17.072725296020508, "learning_rate": 4.785714285714286e-07, "loss": 0.3856, "step": 198 }, { "epoch": 30.615384615384617, "grad_norm": 6.637291431427002, "learning_rate": 4.773809523809523e-07, "loss": 0.3627, "step": 199 }, { "epoch": 30.76923076923077, "grad_norm": 8.751256942749023, "learning_rate": 4.761904761904761e-07, "loss": 0.3637, "step": 200 }, { "epoch": 30.923076923076923, "grad_norm": 5.588964462280273, "learning_rate": 4.7499999999999995e-07, "loss": 0.3716, "step": 201 }, { "epoch": 30.923076923076923, "eval_accuracy": 0.8241758241758241, "eval_loss": 0.5149514079093933, "eval_runtime": 1.1326, "eval_samples_per_second": 241.045, "eval_steps_per_second": 4.415, "step": 201 }, { "epoch": 31.076923076923077, "grad_norm": 8.8916654586792, "learning_rate": 4.738095238095238e-07, "loss": 0.362, "step": 202 }, { "epoch": 31.23076923076923, "grad_norm": 7.172123908996582, "learning_rate": 4.7261904761904756e-07, "loss": 0.3541, "step": 203 }, { "epoch": 31.384615384615383, "grad_norm": 9.399160385131836, "learning_rate": 4.714285714285714e-07, "loss": 0.3629, "step": 204 }, { "epoch": 31.53846153846154, "grad_norm": 12.05125904083252, "learning_rate": 4.702380952380952e-07, "loss": 0.3414, "step": 205 }, { "epoch": 31.692307692307693, "grad_norm": 6.808493137359619, "learning_rate": 4.69047619047619e-07, "loss": 0.372, "step": 206 }, { "epoch": 31.846153846153847, "grad_norm": 12.759626388549805, "learning_rate": 4.6785714285714283e-07, "loss": 0.3574, "step": 207 }, { "epoch": 32.0, "grad_norm": 9.680743217468262, "learning_rate": 4.6666666666666666e-07, "loss": 0.3871, "step": 208 }, { "epoch": 32.0, "eval_accuracy": 0.8315018315018315, "eval_loss": 0.5100430846214294, "eval_runtime": 1.1335, "eval_samples_per_second": 240.838, "eval_steps_per_second": 4.411, "step": 208 }, { "epoch": 32.15384615384615, "grad_norm": 13.226339340209961, "learning_rate": 4.6547619047619044e-07, "loss": 0.3623, "step": 209 }, { "epoch": 32.30769230769231, "grad_norm": 9.359790802001953, "learning_rate": 4.6428571428571427e-07, "loss": 0.4045, "step": 210 }, { "epoch": 32.46153846153846, "grad_norm": 4.744467258453369, "learning_rate": 4.630952380952381e-07, "loss": 0.3852, "step": 211 }, { "epoch": 32.61538461538461, "grad_norm": 9.221460342407227, "learning_rate": 4.6190476190476193e-07, "loss": 0.3267, "step": 212 }, { "epoch": 32.76923076923077, "grad_norm": 5.6791510581970215, "learning_rate": 4.6071428571428566e-07, "loss": 0.3969, "step": 213 }, { "epoch": 32.92307692307692, "grad_norm": 13.916045188903809, "learning_rate": 4.595238095238095e-07, "loss": 0.3729, "step": 214 }, { "epoch": 32.92307692307692, "eval_accuracy": 0.8351648351648352, "eval_loss": 0.4985570013523102, "eval_runtime": 1.1523, "eval_samples_per_second": 236.921, "eval_steps_per_second": 4.339, "step": 214 }, { "epoch": 33.07692307692308, "grad_norm": 20.20587158203125, "learning_rate": 4.5833333333333327e-07, "loss": 0.3999, "step": 215 }, { "epoch": 33.23076923076923, "grad_norm": 5.628478050231934, "learning_rate": 4.571428571428571e-07, "loss": 0.333, "step": 216 }, { "epoch": 33.38461538461539, "grad_norm": 9.061274528503418, "learning_rate": 4.5595238095238093e-07, "loss": 0.3807, "step": 217 }, { "epoch": 33.53846153846154, "grad_norm": 7.302306652069092, "learning_rate": 4.5476190476190476e-07, "loss": 0.3309, "step": 218 }, { "epoch": 33.69230769230769, "grad_norm": 17.617534637451172, "learning_rate": 4.5357142857142854e-07, "loss": 0.319, "step": 219 }, { "epoch": 33.84615384615385, "grad_norm": 9.831780433654785, "learning_rate": 4.5238095238095237e-07, "loss": 0.3849, "step": 220 }, { "epoch": 34.0, "grad_norm": 10.667181968688965, "learning_rate": 4.511904761904762e-07, "loss": 0.3286, "step": 221 }, { "epoch": 34.0, "eval_accuracy": 0.8461538461538461, "eval_loss": 0.4945792853832245, "eval_runtime": 1.1795, "eval_samples_per_second": 231.451, "eval_steps_per_second": 4.239, "step": 221 }, { "epoch": 34.15384615384615, "grad_norm": 6.82171106338501, "learning_rate": 4.5e-07, "loss": 0.3129, "step": 222 }, { "epoch": 34.30769230769231, "grad_norm": 12.462568283081055, "learning_rate": 4.488095238095238e-07, "loss": 0.2947, "step": 223 }, { "epoch": 34.46153846153846, "grad_norm": 14.821395874023438, "learning_rate": 4.4761904761904764e-07, "loss": 0.381, "step": 224 }, { "epoch": 34.61538461538461, "grad_norm": 11.688921928405762, "learning_rate": 4.464285714285714e-07, "loss": 0.392, "step": 225 }, { "epoch": 34.76923076923077, "grad_norm": 16.245473861694336, "learning_rate": 4.452380952380952e-07, "loss": 0.3356, "step": 226 }, { "epoch": 34.92307692307692, "grad_norm": 8.85981559753418, "learning_rate": 4.4404761904761903e-07, "loss": 0.4261, "step": 227 }, { "epoch": 34.92307692307692, "eval_accuracy": 0.8388278388278388, "eval_loss": 0.49569690227508545, "eval_runtime": 1.1615, "eval_samples_per_second": 235.046, "eval_steps_per_second": 4.305, "step": 227 }, { "epoch": 35.07692307692308, "grad_norm": 14.444450378417969, "learning_rate": 4.428571428571428e-07, "loss": 0.3065, "step": 228 }, { "epoch": 35.23076923076923, "grad_norm": 18.541534423828125, "learning_rate": 4.4166666666666664e-07, "loss": 0.319, "step": 229 }, { "epoch": 35.38461538461539, "grad_norm": 14.79796314239502, "learning_rate": 4.4047619047619047e-07, "loss": 0.3511, "step": 230 }, { "epoch": 35.53846153846154, "grad_norm": 14.025275230407715, "learning_rate": 4.3928571428571425e-07, "loss": 0.351, "step": 231 }, { "epoch": 35.69230769230769, "grad_norm": 9.698802947998047, "learning_rate": 4.380952380952381e-07, "loss": 0.3369, "step": 232 }, { "epoch": 35.84615384615385, "grad_norm": 10.56847858428955, "learning_rate": 4.369047619047619e-07, "loss": 0.2976, "step": 233 }, { "epoch": 36.0, "grad_norm": 15.778355598449707, "learning_rate": 4.357142857142857e-07, "loss": 0.4014, "step": 234 }, { "epoch": 36.0, "eval_accuracy": 0.8534798534798534, "eval_loss": 0.48495998978614807, "eval_runtime": 1.1929, "eval_samples_per_second": 228.863, "eval_steps_per_second": 4.192, "step": 234 }, { "epoch": 36.15384615384615, "grad_norm": 16.71240234375, "learning_rate": 4.345238095238095e-07, "loss": 0.3749, "step": 235 }, { "epoch": 36.30769230769231, "grad_norm": 13.584702491760254, "learning_rate": 4.3333333333333335e-07, "loss": 0.3812, "step": 236 }, { "epoch": 36.46153846153846, "grad_norm": 8.108072280883789, "learning_rate": 4.3214285714285713e-07, "loss": 0.3024, "step": 237 }, { "epoch": 36.61538461538461, "grad_norm": 9.233282089233398, "learning_rate": 4.3095238095238096e-07, "loss": 0.3413, "step": 238 }, { "epoch": 36.76923076923077, "grad_norm": 13.080716133117676, "learning_rate": 4.297619047619048e-07, "loss": 0.2792, "step": 239 }, { "epoch": 36.92307692307692, "grad_norm": 10.88381576538086, "learning_rate": 4.285714285714285e-07, "loss": 0.3514, "step": 240 }, { "epoch": 36.92307692307692, "eval_accuracy": 0.8534798534798534, "eval_loss": 0.4806550443172455, "eval_runtime": 1.1462, "eval_samples_per_second": 238.179, "eval_steps_per_second": 4.362, "step": 240 }, { "epoch": 37.07692307692308, "grad_norm": 13.345056533813477, "learning_rate": 4.2738095238095235e-07, "loss": 0.318, "step": 241 }, { "epoch": 37.23076923076923, "grad_norm": 9.593791961669922, "learning_rate": 4.261904761904762e-07, "loss": 0.3487, "step": 242 }, { "epoch": 37.38461538461539, "grad_norm": 9.83149242401123, "learning_rate": 4.2499999999999995e-07, "loss": 0.3283, "step": 243 }, { "epoch": 37.53846153846154, "grad_norm": 11.976517677307129, "learning_rate": 4.238095238095238e-07, "loss": 0.407, "step": 244 }, { "epoch": 37.69230769230769, "grad_norm": 7.24540901184082, "learning_rate": 4.226190476190476e-07, "loss": 0.3899, "step": 245 }, { "epoch": 37.84615384615385, "grad_norm": 11.755511283874512, "learning_rate": 4.214285714285714e-07, "loss": 0.247, "step": 246 }, { "epoch": 38.0, "grad_norm": 14.939422607421875, "learning_rate": 4.202380952380952e-07, "loss": 0.3883, "step": 247 }, { "epoch": 38.0, "eval_accuracy": 0.8534798534798534, "eval_loss": 0.47668036818504333, "eval_runtime": 1.2822, "eval_samples_per_second": 212.923, "eval_steps_per_second": 3.9, "step": 247 }, { "epoch": 38.15384615384615, "grad_norm": 7.6719279289245605, "learning_rate": 4.1904761904761906e-07, "loss": 0.3579, "step": 248 }, { "epoch": 38.30769230769231, "grad_norm": 18.015718460083008, "learning_rate": 4.1785714285714283e-07, "loss": 0.3072, "step": 249 }, { "epoch": 38.46153846153846, "grad_norm": 13.246123313903809, "learning_rate": 4.1666666666666667e-07, "loss": 0.3756, "step": 250 }, { "epoch": 38.61538461538461, "grad_norm": 7.806217670440674, "learning_rate": 4.154761904761905e-07, "loss": 0.3919, "step": 251 }, { "epoch": 38.76923076923077, "grad_norm": 5.912841796875, "learning_rate": 4.142857142857143e-07, "loss": 0.3079, "step": 252 }, { "epoch": 38.92307692307692, "grad_norm": 7.757283687591553, "learning_rate": 4.1309523809523805e-07, "loss": 0.3219, "step": 253 }, { "epoch": 38.92307692307692, "eval_accuracy": 0.8534798534798534, "eval_loss": 0.4762944281101227, "eval_runtime": 1.2149, "eval_samples_per_second": 224.702, "eval_steps_per_second": 4.115, "step": 253 }, { "epoch": 39.07692307692308, "grad_norm": 7.3901238441467285, "learning_rate": 4.119047619047619e-07, "loss": 0.2908, "step": 254 }, { "epoch": 39.23076923076923, "grad_norm": 6.3056535720825195, "learning_rate": 4.1071428571428566e-07, "loss": 0.2982, "step": 255 }, { "epoch": 39.38461538461539, "grad_norm": 9.348855018615723, "learning_rate": 4.095238095238095e-07, "loss": 0.3693, "step": 256 }, { "epoch": 39.53846153846154, "grad_norm": 7.346530914306641, "learning_rate": 4.083333333333333e-07, "loss": 0.3475, "step": 257 }, { "epoch": 39.69230769230769, "grad_norm": 5.94871711730957, "learning_rate": 4.071428571428571e-07, "loss": 0.3274, "step": 258 }, { "epoch": 39.84615384615385, "grad_norm": 35.71484375, "learning_rate": 4.0595238095238093e-07, "loss": 0.3718, "step": 259 }, { "epoch": 40.0, "grad_norm": 13.31654167175293, "learning_rate": 4.0476190476190476e-07, "loss": 0.4351, "step": 260 }, { "epoch": 40.0, "eval_accuracy": 0.8571428571428571, "eval_loss": 0.47377410531044006, "eval_runtime": 1.1651, "eval_samples_per_second": 234.318, "eval_steps_per_second": 4.292, "step": 260 }, { "epoch": 40.15384615384615, "grad_norm": 8.30079460144043, "learning_rate": 4.0357142857142854e-07, "loss": 0.4084, "step": 261 }, { "epoch": 40.30769230769231, "grad_norm": 11.171014785766602, "learning_rate": 4.0238095238095237e-07, "loss": 0.2589, "step": 262 }, { "epoch": 40.46153846153846, "grad_norm": 12.395658493041992, "learning_rate": 4.011904761904762e-07, "loss": 0.3645, "step": 263 }, { "epoch": 40.61538461538461, "grad_norm": 12.52223014831543, "learning_rate": 4e-07, "loss": 0.3063, "step": 264 }, { "epoch": 40.76923076923077, "grad_norm": 22.095630645751953, "learning_rate": 3.988095238095238e-07, "loss": 0.2982, "step": 265 }, { "epoch": 40.92307692307692, "grad_norm": 19.723215103149414, "learning_rate": 3.976190476190476e-07, "loss": 0.3068, "step": 266 }, { "epoch": 40.92307692307692, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.46877339482307434, "eval_runtime": 1.1413, "eval_samples_per_second": 239.204, "eval_steps_per_second": 4.381, "step": 266 }, { "epoch": 41.07692307692308, "grad_norm": 14.390270233154297, "learning_rate": 3.9642857142857137e-07, "loss": 0.3193, "step": 267 }, { "epoch": 41.23076923076923, "grad_norm": 14.494707107543945, "learning_rate": 3.952380952380952e-07, "loss": 0.3274, "step": 268 }, { "epoch": 41.38461538461539, "grad_norm": 9.31578540802002, "learning_rate": 3.9404761904761903e-07, "loss": 0.2905, "step": 269 }, { "epoch": 41.53846153846154, "grad_norm": 11.39842700958252, "learning_rate": 3.928571428571428e-07, "loss": 0.2591, "step": 270 }, { "epoch": 41.69230769230769, "grad_norm": 12.236638069152832, "learning_rate": 3.9166666666666664e-07, "loss": 0.3198, "step": 271 }, { "epoch": 41.84615384615385, "grad_norm": 14.803117752075195, "learning_rate": 3.9047619047619047e-07, "loss": 0.3718, "step": 272 }, { "epoch": 42.0, "grad_norm": 12.712557792663574, "learning_rate": 3.8928571428571425e-07, "loss": 0.3356, "step": 273 }, { "epoch": 42.0, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.45851626992225647, "eval_runtime": 1.178, "eval_samples_per_second": 231.744, "eval_steps_per_second": 4.244, "step": 273 }, { "epoch": 42.15384615384615, "grad_norm": 5.399446964263916, "learning_rate": 3.880952380952381e-07, "loss": 0.3028, "step": 274 }, { "epoch": 42.30769230769231, "grad_norm": 9.010210990905762, "learning_rate": 3.869047619047619e-07, "loss": 0.2897, "step": 275 }, { "epoch": 42.46153846153846, "grad_norm": 8.666064262390137, "learning_rate": 3.857142857142857e-07, "loss": 0.3349, "step": 276 }, { "epoch": 42.61538461538461, "grad_norm": 25.002635955810547, "learning_rate": 3.845238095238095e-07, "loss": 0.3166, "step": 277 }, { "epoch": 42.76923076923077, "grad_norm": 13.861302375793457, "learning_rate": 3.8333333333333335e-07, "loss": 0.3475, "step": 278 }, { "epoch": 42.92307692307692, "grad_norm": 11.409740447998047, "learning_rate": 3.821428571428571e-07, "loss": 0.345, "step": 279 }, { "epoch": 42.92307692307692, "eval_accuracy": 0.8681318681318682, "eval_loss": 0.4540693759918213, "eval_runtime": 1.1168, "eval_samples_per_second": 244.446, "eval_steps_per_second": 4.477, "step": 279 }, { "epoch": 43.07692307692308, "grad_norm": 21.283527374267578, "learning_rate": 3.809523809523809e-07, "loss": 0.3316, "step": 280 }, { "epoch": 43.23076923076923, "grad_norm": 7.409358978271484, "learning_rate": 3.7976190476190474e-07, "loss": 0.3305, "step": 281 }, { "epoch": 43.38461538461539, "grad_norm": 11.763964653015137, "learning_rate": 3.785714285714285e-07, "loss": 0.3293, "step": 282 }, { "epoch": 43.53846153846154, "grad_norm": 5.29448127746582, "learning_rate": 3.7738095238095235e-07, "loss": 0.2883, "step": 283 }, { "epoch": 43.69230769230769, "grad_norm": 16.18635368347168, "learning_rate": 3.761904761904762e-07, "loss": 0.3549, "step": 284 }, { "epoch": 43.84615384615385, "grad_norm": 17.321565628051758, "learning_rate": 3.75e-07, "loss": 0.3016, "step": 285 }, { "epoch": 44.0, "grad_norm": 8.436405181884766, "learning_rate": 3.738095238095238e-07, "loss": 0.3254, "step": 286 }, { "epoch": 44.0, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.45843759179115295, "eval_runtime": 1.1597, "eval_samples_per_second": 235.408, "eval_steps_per_second": 4.311, "step": 286 }, { "epoch": 44.15384615384615, "grad_norm": 11.177824974060059, "learning_rate": 3.726190476190476e-07, "loss": 0.3276, "step": 287 }, { "epoch": 44.30769230769231, "grad_norm": 10.337651252746582, "learning_rate": 3.7142857142857145e-07, "loss": 0.3419, "step": 288 }, { "epoch": 44.46153846153846, "grad_norm": 21.42737579345703, "learning_rate": 3.7023809523809523e-07, "loss": 0.2689, "step": 289 }, { "epoch": 44.61538461538461, "grad_norm": 11.9132661819458, "learning_rate": 3.6904761904761906e-07, "loss": 0.3776, "step": 290 }, { "epoch": 44.76923076923077, "grad_norm": 14.86318302154541, "learning_rate": 3.678571428571429e-07, "loss": 0.3984, "step": 291 }, { "epoch": 44.92307692307692, "grad_norm": 15.56070327758789, "learning_rate": 3.666666666666666e-07, "loss": 0.3164, "step": 292 }, { "epoch": 44.92307692307692, "eval_accuracy": 0.8571428571428571, "eval_loss": 0.4591527581214905, "eval_runtime": 1.1076, "eval_samples_per_second": 246.479, "eval_steps_per_second": 4.514, "step": 292 }, { "epoch": 45.07692307692308, "grad_norm": 14.275500297546387, "learning_rate": 3.6547619047619045e-07, "loss": 0.2905, "step": 293 }, { "epoch": 45.23076923076923, "grad_norm": 11.206035614013672, "learning_rate": 3.642857142857143e-07, "loss": 0.2992, "step": 294 }, { "epoch": 45.38461538461539, "grad_norm": 10.007750511169434, "learning_rate": 3.6309523809523805e-07, "loss": 0.3662, "step": 295 }, { "epoch": 45.53846153846154, "grad_norm": 13.443836212158203, "learning_rate": 3.619047619047619e-07, "loss": 0.3129, "step": 296 }, { "epoch": 45.69230769230769, "grad_norm": 22.08678436279297, "learning_rate": 3.607142857142857e-07, "loss": 0.3904, "step": 297 }, { "epoch": 45.84615384615385, "grad_norm": 24.27136993408203, "learning_rate": 3.595238095238095e-07, "loss": 0.3193, "step": 298 }, { "epoch": 46.0, "grad_norm": 9.215178489685059, "learning_rate": 3.583333333333333e-07, "loss": 0.3657, "step": 299 }, { "epoch": 46.0, "eval_accuracy": 0.8608058608058609, "eval_loss": 0.4533578157424927, "eval_runtime": 1.1346, "eval_samples_per_second": 240.624, "eval_steps_per_second": 4.407, "step": 299 }, { "epoch": 46.15384615384615, "grad_norm": 13.869656562805176, "learning_rate": 3.5714285714285716e-07, "loss": 0.3241, "step": 300 }, { "epoch": 46.30769230769231, "grad_norm": 13.870816230773926, "learning_rate": 3.5595238095238094e-07, "loss": 0.294, "step": 301 }, { "epoch": 46.46153846153846, "grad_norm": 5.2440338134765625, "learning_rate": 3.5476190476190477e-07, "loss": 0.3046, "step": 302 }, { "epoch": 46.61538461538461, "grad_norm": 11.387068748474121, "learning_rate": 3.535714285714286e-07, "loss": 0.4067, "step": 303 }, { "epoch": 46.76923076923077, "grad_norm": 10.643122673034668, "learning_rate": 3.523809523809524e-07, "loss": 0.3217, "step": 304 }, { "epoch": 46.92307692307692, "grad_norm": 21.845155715942383, "learning_rate": 3.5119047619047615e-07, "loss": 0.2655, "step": 305 }, { "epoch": 46.92307692307692, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.4501632750034332, "eval_runtime": 1.1413, "eval_samples_per_second": 239.204, "eval_steps_per_second": 4.381, "step": 305 }, { "epoch": 47.07692307692308, "grad_norm": 12.1947603225708, "learning_rate": 3.5e-07, "loss": 0.2994, "step": 306 }, { "epoch": 47.23076923076923, "grad_norm": 21.314617156982422, "learning_rate": 3.4880952380952376e-07, "loss": 0.2751, "step": 307 }, { "epoch": 47.38461538461539, "grad_norm": 5.636424541473389, "learning_rate": 3.476190476190476e-07, "loss": 0.292, "step": 308 }, { "epoch": 47.53846153846154, "grad_norm": 11.540352821350098, "learning_rate": 3.464285714285714e-07, "loss": 0.3693, "step": 309 }, { "epoch": 47.69230769230769, "grad_norm": 9.435784339904785, "learning_rate": 3.452380952380952e-07, "loss": 0.334, "step": 310 }, { "epoch": 47.84615384615385, "grad_norm": 14.798314094543457, "learning_rate": 3.4404761904761903e-07, "loss": 0.2237, "step": 311 }, { "epoch": 48.0, "grad_norm": 10.26159954071045, "learning_rate": 3.4285714285714286e-07, "loss": 0.2981, "step": 312 }, { "epoch": 48.0, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.4451758563518524, "eval_runtime": 1.2162, "eval_samples_per_second": 224.473, "eval_steps_per_second": 4.111, "step": 312 }, { "epoch": 48.15384615384615, "grad_norm": 19.33696937561035, "learning_rate": 3.4166666666666664e-07, "loss": 0.2414, "step": 313 }, { "epoch": 48.30769230769231, "grad_norm": 8.318500518798828, "learning_rate": 3.4047619047619047e-07, "loss": 0.3193, "step": 314 }, { "epoch": 48.46153846153846, "grad_norm": 19.92133140563965, "learning_rate": 3.392857142857143e-07, "loss": 0.3218, "step": 315 }, { "epoch": 48.61538461538461, "grad_norm": 18.465848922729492, "learning_rate": 3.380952380952381e-07, "loss": 0.3046, "step": 316 }, { "epoch": 48.76923076923077, "grad_norm": 20.254159927368164, "learning_rate": 3.369047619047619e-07, "loss": 0.3651, "step": 317 }, { "epoch": 48.92307692307692, "grad_norm": 9.707018852233887, "learning_rate": 3.357142857142857e-07, "loss": 0.3508, "step": 318 }, { "epoch": 48.92307692307692, "eval_accuracy": 0.8791208791208791, "eval_loss": 0.4371393620967865, "eval_runtime": 1.2035, "eval_samples_per_second": 226.845, "eval_steps_per_second": 4.155, "step": 318 }, { "epoch": 49.07692307692308, "grad_norm": 9.588353157043457, "learning_rate": 3.3452380952380947e-07, "loss": 0.4015, "step": 319 }, { "epoch": 49.23076923076923, "grad_norm": 19.106985092163086, "learning_rate": 3.333333333333333e-07, "loss": 0.2461, "step": 320 }, { "epoch": 49.38461538461539, "grad_norm": 18.668371200561523, "learning_rate": 3.3214285714285713e-07, "loss": 0.3201, "step": 321 }, { "epoch": 49.53846153846154, "grad_norm": 22.97618865966797, "learning_rate": 3.309523809523809e-07, "loss": 0.2973, "step": 322 }, { "epoch": 49.69230769230769, "grad_norm": 15.057040214538574, "learning_rate": 3.2976190476190474e-07, "loss": 0.2931, "step": 323 }, { "epoch": 49.84615384615385, "grad_norm": 9.635611534118652, "learning_rate": 3.2857142857142857e-07, "loss": 0.3173, "step": 324 }, { "epoch": 50.0, "grad_norm": 8.364943504333496, "learning_rate": 3.2738095238095235e-07, "loss": 0.3419, "step": 325 }, { "epoch": 50.0, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.43940499424934387, "eval_runtime": 1.1837, "eval_samples_per_second": 230.634, "eval_steps_per_second": 4.224, "step": 325 }, { "epoch": 50.15384615384615, "grad_norm": 10.205245018005371, "learning_rate": 3.261904761904762e-07, "loss": 0.3744, "step": 326 }, { "epoch": 50.30769230769231, "grad_norm": 8.429767608642578, "learning_rate": 3.25e-07, "loss": 0.2852, "step": 327 }, { "epoch": 50.46153846153846, "grad_norm": 19.509811401367188, "learning_rate": 3.238095238095238e-07, "loss": 0.283, "step": 328 }, { "epoch": 50.61538461538461, "grad_norm": 12.072210311889648, "learning_rate": 3.226190476190476e-07, "loss": 0.2955, "step": 329 }, { "epoch": 50.76923076923077, "grad_norm": 19.032461166381836, "learning_rate": 3.2142857142857145e-07, "loss": 0.351, "step": 330 }, { "epoch": 50.92307692307692, "grad_norm": 24.8001708984375, "learning_rate": 3.202380952380952e-07, "loss": 0.2668, "step": 331 }, { "epoch": 50.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4430113434791565, "eval_runtime": 1.1828, "eval_samples_per_second": 230.809, "eval_steps_per_second": 4.227, "step": 331 }, { "epoch": 51.07692307692308, "grad_norm": 7.619977951049805, "learning_rate": 3.19047619047619e-07, "loss": 0.2316, "step": 332 }, { "epoch": 51.23076923076923, "grad_norm": 15.534941673278809, "learning_rate": 3.1785714285714284e-07, "loss": 0.2948, "step": 333 }, { "epoch": 51.38461538461539, "grad_norm": 6.369411945343018, "learning_rate": 3.166666666666666e-07, "loss": 0.2319, "step": 334 }, { "epoch": 51.53846153846154, "grad_norm": 12.510886192321777, "learning_rate": 3.1547619047619045e-07, "loss": 0.3389, "step": 335 }, { "epoch": 51.69230769230769, "grad_norm": 9.731184005737305, "learning_rate": 3.142857142857143e-07, "loss": 0.2822, "step": 336 }, { "epoch": 51.84615384615385, "grad_norm": 6.847411155700684, "learning_rate": 3.1309523809523806e-07, "loss": 0.3447, "step": 337 }, { "epoch": 52.0, "grad_norm": 15.400504112243652, "learning_rate": 3.119047619047619e-07, "loss": 0.2972, "step": 338 }, { "epoch": 52.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.43954789638519287, "eval_runtime": 1.1577, "eval_samples_per_second": 235.817, "eval_steps_per_second": 4.319, "step": 338 }, { "epoch": 52.15384615384615, "grad_norm": 14.40497875213623, "learning_rate": 3.107142857142857e-07, "loss": 0.2947, "step": 339 }, { "epoch": 52.30769230769231, "grad_norm": 12.60912799835205, "learning_rate": 3.095238095238095e-07, "loss": 0.2866, "step": 340 }, { "epoch": 52.46153846153846, "grad_norm": 10.782893180847168, "learning_rate": 3.0833333333333333e-07, "loss": 0.275, "step": 341 }, { "epoch": 52.61538461538461, "grad_norm": 14.848359107971191, "learning_rate": 3.0714285714285716e-07, "loss": 0.3377, "step": 342 }, { "epoch": 52.76923076923077, "grad_norm": 16.875308990478516, "learning_rate": 3.0595238095238094e-07, "loss": 0.2871, "step": 343 }, { "epoch": 52.92307692307692, "grad_norm": 10.62590217590332, "learning_rate": 3.0476190476190477e-07, "loss": 0.3514, "step": 344 }, { "epoch": 52.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.437090665102005, "eval_runtime": 1.1411, "eval_samples_per_second": 239.24, "eval_steps_per_second": 4.382, "step": 344 }, { "epoch": 53.07692307692308, "grad_norm": 19.662609100341797, "learning_rate": 3.0357142857142855e-07, "loss": 0.2457, "step": 345 }, { "epoch": 53.23076923076923, "grad_norm": 10.951351165771484, "learning_rate": 3.023809523809523e-07, "loss": 0.2542, "step": 346 }, { "epoch": 53.38461538461539, "grad_norm": 6.810473918914795, "learning_rate": 3.0119047619047616e-07, "loss": 0.2873, "step": 347 }, { "epoch": 53.53846153846154, "grad_norm": 11.747807502746582, "learning_rate": 3e-07, "loss": 0.3965, "step": 348 }, { "epoch": 53.69230769230769, "grad_norm": 12.671740531921387, "learning_rate": 2.9880952380952376e-07, "loss": 0.3395, "step": 349 }, { "epoch": 53.84615384615385, "grad_norm": 10.57718276977539, "learning_rate": 2.976190476190476e-07, "loss": 0.3071, "step": 350 }, { "epoch": 54.0, "grad_norm": 7.219900131225586, "learning_rate": 2.9642857142857143e-07, "loss": 0.3012, "step": 351 }, { "epoch": 54.0, "eval_accuracy": 0.8791208791208791, "eval_loss": 0.43296605348587036, "eval_runtime": 1.1556, "eval_samples_per_second": 236.25, "eval_steps_per_second": 4.327, "step": 351 }, { "epoch": 54.15384615384615, "grad_norm": 7.588340759277344, "learning_rate": 2.952380952380952e-07, "loss": 0.3188, "step": 352 }, { "epoch": 54.30769230769231, "grad_norm": 20.165128707885742, "learning_rate": 2.9404761904761904e-07, "loss": 0.377, "step": 353 }, { "epoch": 54.46153846153846, "grad_norm": 9.232548713684082, "learning_rate": 2.9285714285714287e-07, "loss": 0.276, "step": 354 }, { "epoch": 54.61538461538461, "grad_norm": 8.916671752929688, "learning_rate": 2.916666666666667e-07, "loss": 0.3599, "step": 355 }, { "epoch": 54.76923076923077, "grad_norm": 11.789280891418457, "learning_rate": 2.904761904761905e-07, "loss": 0.3004, "step": 356 }, { "epoch": 54.92307692307692, "grad_norm": 8.527569770812988, "learning_rate": 2.892857142857143e-07, "loss": 0.2725, "step": 357 }, { "epoch": 54.92307692307692, "eval_accuracy": 0.8791208791208791, "eval_loss": 0.4297783672809601, "eval_runtime": 1.1219, "eval_samples_per_second": 243.346, "eval_steps_per_second": 4.457, "step": 357 }, { "epoch": 55.07692307692308, "grad_norm": 9.423765182495117, "learning_rate": 2.8809523809523803e-07, "loss": 0.2731, "step": 358 }, { "epoch": 55.23076923076923, "grad_norm": 13.653554916381836, "learning_rate": 2.8690476190476186e-07, "loss": 0.4064, "step": 359 }, { "epoch": 55.38461538461539, "grad_norm": 7.217813491821289, "learning_rate": 2.857142857142857e-07, "loss": 0.305, "step": 360 }, { "epoch": 55.53846153846154, "grad_norm": 7.166869640350342, "learning_rate": 2.845238095238095e-07, "loss": 0.2488, "step": 361 }, { "epoch": 55.69230769230769, "grad_norm": 6.370850086212158, "learning_rate": 2.833333333333333e-07, "loss": 0.2946, "step": 362 }, { "epoch": 55.84615384615385, "grad_norm": 19.36492919921875, "learning_rate": 2.8214285714285713e-07, "loss": 0.3314, "step": 363 }, { "epoch": 56.0, "grad_norm": 10.563324928283691, "learning_rate": 2.8095238095238096e-07, "loss": 0.2547, "step": 364 }, { "epoch": 56.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4288838803768158, "eval_runtime": 1.1679, "eval_samples_per_second": 233.746, "eval_steps_per_second": 4.281, "step": 364 }, { "epoch": 56.15384615384615, "grad_norm": 10.190791130065918, "learning_rate": 2.7976190476190474e-07, "loss": 0.2646, "step": 365 }, { "epoch": 56.30769230769231, "grad_norm": 16.170412063598633, "learning_rate": 2.785714285714286e-07, "loss": 0.3392, "step": 366 }, { "epoch": 56.46153846153846, "grad_norm": 7.313807964324951, "learning_rate": 2.773809523809524e-07, "loss": 0.2909, "step": 367 }, { "epoch": 56.61538461538461, "grad_norm": 9.653914451599121, "learning_rate": 2.761904761904762e-07, "loss": 0.3295, "step": 368 }, { "epoch": 56.76923076923077, "grad_norm": 6.966893672943115, "learning_rate": 2.75e-07, "loss": 0.2956, "step": 369 }, { "epoch": 56.92307692307692, "grad_norm": 15.753593444824219, "learning_rate": 2.7380952380952385e-07, "loss": 0.2896, "step": 370 }, { "epoch": 56.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4282112419605255, "eval_runtime": 1.1522, "eval_samples_per_second": 236.937, "eval_steps_per_second": 4.339, "step": 370 }, { "epoch": 57.07692307692308, "grad_norm": 13.775495529174805, "learning_rate": 2.7261904761904757e-07, "loss": 0.386, "step": 371 }, { "epoch": 57.23076923076923, "grad_norm": 6.003649711608887, "learning_rate": 2.714285714285714e-07, "loss": 0.2473, "step": 372 }, { "epoch": 57.38461538461539, "grad_norm": 22.614078521728516, "learning_rate": 2.7023809523809523e-07, "loss": 0.4098, "step": 373 }, { "epoch": 57.53846153846154, "grad_norm": 15.326905250549316, "learning_rate": 2.69047619047619e-07, "loss": 0.2628, "step": 374 }, { "epoch": 57.69230769230769, "grad_norm": 6.482524871826172, "learning_rate": 2.6785714285714284e-07, "loss": 0.3312, "step": 375 }, { "epoch": 57.84615384615385, "grad_norm": 19.687318801879883, "learning_rate": 2.6666666666666667e-07, "loss": 0.3625, "step": 376 }, { "epoch": 58.0, "grad_norm": 11.662421226501465, "learning_rate": 2.6547619047619045e-07, "loss": 0.3469, "step": 377 }, { "epoch": 58.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4272923469543457, "eval_runtime": 1.1242, "eval_samples_per_second": 242.848, "eval_steps_per_second": 4.448, "step": 377 }, { "epoch": 58.15384615384615, "grad_norm": 8.805383682250977, "learning_rate": 2.642857142857143e-07, "loss": 0.3987, "step": 378 }, { "epoch": 58.30769230769231, "grad_norm": 11.661012649536133, "learning_rate": 2.630952380952381e-07, "loss": 0.2147, "step": 379 }, { "epoch": 58.46153846153846, "grad_norm": 12.969446182250977, "learning_rate": 2.619047619047619e-07, "loss": 0.3074, "step": 380 }, { "epoch": 58.61538461538461, "grad_norm": 9.435002326965332, "learning_rate": 2.607142857142857e-07, "loss": 0.2626, "step": 381 }, { "epoch": 58.76923076923077, "grad_norm": 14.217181205749512, "learning_rate": 2.5952380952380955e-07, "loss": 0.2264, "step": 382 }, { "epoch": 58.92307692307692, "grad_norm": 13.90890884399414, "learning_rate": 2.5833333333333333e-07, "loss": 0.3528, "step": 383 }, { "epoch": 58.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4268935024738312, "eval_runtime": 1.1071, "eval_samples_per_second": 246.586, "eval_steps_per_second": 4.516, "step": 383 }, { "epoch": 59.07692307692308, "grad_norm": 6.627048492431641, "learning_rate": 2.571428571428571e-07, "loss": 0.2552, "step": 384 }, { "epoch": 59.23076923076923, "grad_norm": 14.275843620300293, "learning_rate": 2.5595238095238094e-07, "loss": 0.2876, "step": 385 }, { "epoch": 59.38461538461539, "grad_norm": 15.865604400634766, "learning_rate": 2.547619047619047e-07, "loss": 0.3701, "step": 386 }, { "epoch": 59.53846153846154, "grad_norm": 12.051728248596191, "learning_rate": 2.5357142857142855e-07, "loss": 0.2598, "step": 387 }, { "epoch": 59.69230769230769, "grad_norm": 11.886255264282227, "learning_rate": 2.523809523809524e-07, "loss": 0.238, "step": 388 }, { "epoch": 59.84615384615385, "grad_norm": 12.55905818939209, "learning_rate": 2.5119047619047616e-07, "loss": 0.2893, "step": 389 }, { "epoch": 60.0, "grad_norm": 18.840225219726562, "learning_rate": 2.5e-07, "loss": 0.2552, "step": 390 }, { "epoch": 60.0, "eval_accuracy": 0.8681318681318682, "eval_loss": 0.4324240982532501, "eval_runtime": 1.1584, "eval_samples_per_second": 235.676, "eval_steps_per_second": 4.316, "step": 390 }, { "epoch": 60.15384615384615, "grad_norm": 7.739254474639893, "learning_rate": 2.488095238095238e-07, "loss": 0.3825, "step": 391 }, { "epoch": 60.30769230769231, "grad_norm": 27.177335739135742, "learning_rate": 2.476190476190476e-07, "loss": 0.3577, "step": 392 }, { "epoch": 60.46153846153846, "grad_norm": 8.35522747039795, "learning_rate": 2.4642857142857143e-07, "loss": 0.296, "step": 393 }, { "epoch": 60.61538461538461, "grad_norm": 12.505022048950195, "learning_rate": 2.452380952380952e-07, "loss": 0.2922, "step": 394 }, { "epoch": 60.76923076923077, "grad_norm": 8.860665321350098, "learning_rate": 2.4404761904761904e-07, "loss": 0.3924, "step": 395 }, { "epoch": 60.92307692307692, "grad_norm": 13.714400291442871, "learning_rate": 2.4285714285714287e-07, "loss": 0.239, "step": 396 }, { "epoch": 60.92307692307692, "eval_accuracy": 0.8644688644688645, "eval_loss": 0.4319455325603485, "eval_runtime": 1.158, "eval_samples_per_second": 235.758, "eval_steps_per_second": 4.318, "step": 396 }, { "epoch": 61.07692307692308, "grad_norm": 11.82459831237793, "learning_rate": 2.4166666666666665e-07, "loss": 0.2555, "step": 397 }, { "epoch": 61.23076923076923, "grad_norm": 13.206809997558594, "learning_rate": 2.404761904761905e-07, "loss": 0.3353, "step": 398 }, { "epoch": 61.38461538461539, "grad_norm": 11.129719734191895, "learning_rate": 2.392857142857143e-07, "loss": 0.313, "step": 399 }, { "epoch": 61.53846153846154, "grad_norm": 19.73814582824707, "learning_rate": 2.3809523809523806e-07, "loss": 0.2591, "step": 400 }, { "epoch": 61.69230769230769, "grad_norm": 10.5856351852417, "learning_rate": 2.369047619047619e-07, "loss": 0.3416, "step": 401 }, { "epoch": 61.84615384615385, "grad_norm": 12.684653282165527, "learning_rate": 2.357142857142857e-07, "loss": 0.2315, "step": 402 }, { "epoch": 62.0, "grad_norm": 8.558398246765137, "learning_rate": 2.345238095238095e-07, "loss": 0.3321, "step": 403 }, { "epoch": 62.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.42702218890190125, "eval_runtime": 1.1757, "eval_samples_per_second": 232.2, "eval_steps_per_second": 4.253, "step": 403 }, { "epoch": 62.15384615384615, "grad_norm": 9.598026275634766, "learning_rate": 2.3333333333333333e-07, "loss": 0.3119, "step": 404 }, { "epoch": 62.30769230769231, "grad_norm": 13.107952117919922, "learning_rate": 2.3214285714285714e-07, "loss": 0.3379, "step": 405 }, { "epoch": 62.46153846153846, "grad_norm": 18.639419555664062, "learning_rate": 2.3095238095238097e-07, "loss": 0.2689, "step": 406 }, { "epoch": 62.61538461538461, "grad_norm": 17.175498962402344, "learning_rate": 2.2976190476190475e-07, "loss": 0.3154, "step": 407 }, { "epoch": 62.76923076923077, "grad_norm": 11.377558708190918, "learning_rate": 2.2857142857142855e-07, "loss": 0.2969, "step": 408 }, { "epoch": 62.92307692307692, "grad_norm": 12.16077709197998, "learning_rate": 2.2738095238095238e-07, "loss": 0.3115, "step": 409 }, { "epoch": 62.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.41838309168815613, "eval_runtime": 1.133, "eval_samples_per_second": 240.951, "eval_steps_per_second": 4.413, "step": 409 }, { "epoch": 63.07692307692308, "grad_norm": 29.19352912902832, "learning_rate": 2.2619047619047619e-07, "loss": 0.327, "step": 410 }, { "epoch": 63.23076923076923, "grad_norm": 21.762849807739258, "learning_rate": 2.25e-07, "loss": 0.3142, "step": 411 }, { "epoch": 63.38461538461539, "grad_norm": 20.668453216552734, "learning_rate": 2.2380952380952382e-07, "loss": 0.2705, "step": 412 }, { "epoch": 63.53846153846154, "grad_norm": 10.485206604003906, "learning_rate": 2.226190476190476e-07, "loss": 0.2635, "step": 413 }, { "epoch": 63.69230769230769, "grad_norm": 5.819842338562012, "learning_rate": 2.214285714285714e-07, "loss": 0.281, "step": 414 }, { "epoch": 63.84615384615385, "grad_norm": 8.578193664550781, "learning_rate": 2.2023809523809523e-07, "loss": 0.2981, "step": 415 }, { "epoch": 64.0, "grad_norm": 14.02076244354248, "learning_rate": 2.1904761904761904e-07, "loss": 0.306, "step": 416 }, { "epoch": 64.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4168645739555359, "eval_runtime": 1.1838, "eval_samples_per_second": 230.615, "eval_steps_per_second": 4.224, "step": 416 }, { "epoch": 64.15384615384616, "grad_norm": 8.580323219299316, "learning_rate": 2.1785714285714284e-07, "loss": 0.2481, "step": 417 }, { "epoch": 64.3076923076923, "grad_norm": 12.701449394226074, "learning_rate": 2.1666666666666667e-07, "loss": 0.2978, "step": 418 }, { "epoch": 64.46153846153847, "grad_norm": 7.2950544357299805, "learning_rate": 2.1547619047619048e-07, "loss": 0.2778, "step": 419 }, { "epoch": 64.61538461538461, "grad_norm": 14.55117416381836, "learning_rate": 2.1428571428571426e-07, "loss": 0.3589, "step": 420 }, { "epoch": 64.76923076923077, "grad_norm": 15.672528266906738, "learning_rate": 2.130952380952381e-07, "loss": 0.3638, "step": 421 }, { "epoch": 64.92307692307692, "grad_norm": 8.848112106323242, "learning_rate": 2.119047619047619e-07, "loss": 0.3086, "step": 422 }, { "epoch": 64.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.41758179664611816, "eval_runtime": 1.1479, "eval_samples_per_second": 237.817, "eval_steps_per_second": 4.356, "step": 422 }, { "epoch": 65.07692307692308, "grad_norm": 20.250051498413086, "learning_rate": 2.107142857142857e-07, "loss": 0.2965, "step": 423 }, { "epoch": 65.23076923076923, "grad_norm": 8.315861701965332, "learning_rate": 2.0952380952380953e-07, "loss": 0.3119, "step": 424 }, { "epoch": 65.38461538461539, "grad_norm": 7.086258888244629, "learning_rate": 2.0833333333333333e-07, "loss": 0.2904, "step": 425 }, { "epoch": 65.53846153846153, "grad_norm": 9.35214900970459, "learning_rate": 2.0714285714285714e-07, "loss": 0.2787, "step": 426 }, { "epoch": 65.6923076923077, "grad_norm": 11.061731338500977, "learning_rate": 2.0595238095238094e-07, "loss": 0.287, "step": 427 }, { "epoch": 65.84615384615384, "grad_norm": 17.736583709716797, "learning_rate": 2.0476190476190475e-07, "loss": 0.3101, "step": 428 }, { "epoch": 66.0, "grad_norm": 20.51115608215332, "learning_rate": 2.0357142857142855e-07, "loss": 0.4256, "step": 429 }, { "epoch": 66.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4195899963378906, "eval_runtime": 1.2137, "eval_samples_per_second": 224.925, "eval_steps_per_second": 4.119, "step": 429 }, { "epoch": 66.15384615384616, "grad_norm": 7.543363094329834, "learning_rate": 2.0238095238095238e-07, "loss": 0.1944, "step": 430 }, { "epoch": 66.3076923076923, "grad_norm": 15.504809379577637, "learning_rate": 2.0119047619047619e-07, "loss": 0.2767, "step": 431 }, { "epoch": 66.46153846153847, "grad_norm": 12.454192161560059, "learning_rate": 2e-07, "loss": 0.2697, "step": 432 }, { "epoch": 66.61538461538461, "grad_norm": 11.19575023651123, "learning_rate": 1.988095238095238e-07, "loss": 0.2626, "step": 433 }, { "epoch": 66.76923076923077, "grad_norm": 8.080245971679688, "learning_rate": 1.976190476190476e-07, "loss": 0.3262, "step": 434 }, { "epoch": 66.92307692307692, "grad_norm": 16.002864837646484, "learning_rate": 1.964285714285714e-07, "loss": 0.2798, "step": 435 }, { "epoch": 66.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4219285249710083, "eval_runtime": 1.1309, "eval_samples_per_second": 241.401, "eval_steps_per_second": 4.421, "step": 435 }, { "epoch": 67.07692307692308, "grad_norm": 9.127429008483887, "learning_rate": 1.9523809523809524e-07, "loss": 0.2828, "step": 436 }, { "epoch": 67.23076923076923, "grad_norm": 9.812334060668945, "learning_rate": 1.9404761904761904e-07, "loss": 0.3028, "step": 437 }, { "epoch": 67.38461538461539, "grad_norm": 16.504629135131836, "learning_rate": 1.9285714285714284e-07, "loss": 0.2382, "step": 438 }, { "epoch": 67.53846153846153, "grad_norm": 15.225912094116211, "learning_rate": 1.9166666666666668e-07, "loss": 0.3387, "step": 439 }, { "epoch": 67.6923076923077, "grad_norm": 32.0360221862793, "learning_rate": 1.9047619047619045e-07, "loss": 0.252, "step": 440 }, { "epoch": 67.84615384615384, "grad_norm": 10.604074478149414, "learning_rate": 1.8928571428571426e-07, "loss": 0.3423, "step": 441 }, { "epoch": 68.0, "grad_norm": 11.707740783691406, "learning_rate": 1.880952380952381e-07, "loss": 0.3016, "step": 442 }, { "epoch": 68.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4223931133747101, "eval_runtime": 1.1901, "eval_samples_per_second": 229.39, "eval_steps_per_second": 4.201, "step": 442 }, { "epoch": 68.15384615384616, "grad_norm": 15.720952987670898, "learning_rate": 1.869047619047619e-07, "loss": 0.2488, "step": 443 }, { "epoch": 68.3076923076923, "grad_norm": 21.468849182128906, "learning_rate": 1.8571428571428572e-07, "loss": 0.3655, "step": 444 }, { "epoch": 68.46153846153847, "grad_norm": 7.170112133026123, "learning_rate": 1.8452380952380953e-07, "loss": 0.2293, "step": 445 }, { "epoch": 68.61538461538461, "grad_norm": 13.677178382873535, "learning_rate": 1.833333333333333e-07, "loss": 0.3512, "step": 446 }, { "epoch": 68.76923076923077, "grad_norm": 15.128756523132324, "learning_rate": 1.8214285714285714e-07, "loss": 0.3291, "step": 447 }, { "epoch": 68.92307692307692, "grad_norm": 9.91322135925293, "learning_rate": 1.8095238095238094e-07, "loss": 0.2791, "step": 448 }, { "epoch": 68.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.42070242762565613, "eval_runtime": 1.1329, "eval_samples_per_second": 240.966, "eval_steps_per_second": 4.413, "step": 448 }, { "epoch": 69.07692307692308, "grad_norm": 18.428592681884766, "learning_rate": 1.7976190476190475e-07, "loss": 0.2857, "step": 449 }, { "epoch": 69.23076923076923, "grad_norm": 13.563444137573242, "learning_rate": 1.7857142857142858e-07, "loss": 0.3579, "step": 450 }, { "epoch": 69.38461538461539, "grad_norm": 7.071059226989746, "learning_rate": 1.7738095238095238e-07, "loss": 0.288, "step": 451 }, { "epoch": 69.53846153846153, "grad_norm": 13.733859062194824, "learning_rate": 1.761904761904762e-07, "loss": 0.3107, "step": 452 }, { "epoch": 69.6923076923077, "grad_norm": 24.475296020507812, "learning_rate": 1.75e-07, "loss": 0.3036, "step": 453 }, { "epoch": 69.84615384615384, "grad_norm": 8.03947925567627, "learning_rate": 1.738095238095238e-07, "loss": 0.3046, "step": 454 }, { "epoch": 70.0, "grad_norm": 12.644840240478516, "learning_rate": 1.726190476190476e-07, "loss": 0.2651, "step": 455 }, { "epoch": 70.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4188561737537384, "eval_runtime": 1.1824, "eval_samples_per_second": 230.892, "eval_steps_per_second": 4.229, "step": 455 }, { "epoch": 70.15384615384616, "grad_norm": 10.41529655456543, "learning_rate": 1.7142857142857143e-07, "loss": 0.2741, "step": 456 }, { "epoch": 70.3076923076923, "grad_norm": 13.780343055725098, "learning_rate": 1.7023809523809524e-07, "loss": 0.1817, "step": 457 }, { "epoch": 70.46153846153847, "grad_norm": 15.142984390258789, "learning_rate": 1.6904761904761904e-07, "loss": 0.2511, "step": 458 }, { "epoch": 70.61538461538461, "grad_norm": 11.113754272460938, "learning_rate": 1.6785714285714285e-07, "loss": 0.3617, "step": 459 }, { "epoch": 70.76923076923077, "grad_norm": 9.247007369995117, "learning_rate": 1.6666666666666665e-07, "loss": 0.3423, "step": 460 }, { "epoch": 70.92307692307692, "grad_norm": 10.317591667175293, "learning_rate": 1.6547619047619045e-07, "loss": 0.2466, "step": 461 }, { "epoch": 70.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4177640378475189, "eval_runtime": 1.1382, "eval_samples_per_second": 239.846, "eval_steps_per_second": 4.393, "step": 461 }, { "epoch": 71.07692307692308, "grad_norm": 12.4044771194458, "learning_rate": 1.6428571428571429e-07, "loss": 0.274, "step": 462 }, { "epoch": 71.23076923076923, "grad_norm": 12.682540893554688, "learning_rate": 1.630952380952381e-07, "loss": 0.3048, "step": 463 }, { "epoch": 71.38461538461539, "grad_norm": 25.84153175354004, "learning_rate": 1.619047619047619e-07, "loss": 0.2463, "step": 464 }, { "epoch": 71.53846153846153, "grad_norm": 13.235491752624512, "learning_rate": 1.6071428571428573e-07, "loss": 0.415, "step": 465 }, { "epoch": 71.6923076923077, "grad_norm": 6.873939514160156, "learning_rate": 1.595238095238095e-07, "loss": 0.3163, "step": 466 }, { "epoch": 71.84615384615384, "grad_norm": 16.569108963012695, "learning_rate": 1.583333333333333e-07, "loss": 0.3067, "step": 467 }, { "epoch": 72.0, "grad_norm": 18.71702003479004, "learning_rate": 1.5714285714285714e-07, "loss": 0.1913, "step": 468 }, { "epoch": 72.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4177253544330597, "eval_runtime": 1.1798, "eval_samples_per_second": 231.388, "eval_steps_per_second": 4.238, "step": 468 }, { "epoch": 72.15384615384616, "grad_norm": 21.70823097229004, "learning_rate": 1.5595238095238094e-07, "loss": 0.3477, "step": 469 }, { "epoch": 72.3076923076923, "grad_norm": 18.4373779296875, "learning_rate": 1.5476190476190475e-07, "loss": 0.3226, "step": 470 }, { "epoch": 72.46153846153847, "grad_norm": 12.795662879943848, "learning_rate": 1.5357142857142858e-07, "loss": 0.36, "step": 471 }, { "epoch": 72.61538461538461, "grad_norm": 6.41522741317749, "learning_rate": 1.5238095238095238e-07, "loss": 0.2615, "step": 472 }, { "epoch": 72.76923076923077, "grad_norm": 11.777397155761719, "learning_rate": 1.5119047619047616e-07, "loss": 0.2648, "step": 473 }, { "epoch": 72.92307692307692, "grad_norm": 14.508996963500977, "learning_rate": 1.5e-07, "loss": 0.2719, "step": 474 }, { "epoch": 72.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4163550138473511, "eval_runtime": 1.1181, "eval_samples_per_second": 244.158, "eval_steps_per_second": 4.472, "step": 474 }, { "epoch": 73.07692307692308, "grad_norm": 15.088624000549316, "learning_rate": 1.488095238095238e-07, "loss": 0.3153, "step": 475 }, { "epoch": 73.23076923076923, "grad_norm": 6.610665798187256, "learning_rate": 1.476190476190476e-07, "loss": 0.2292, "step": 476 }, { "epoch": 73.38461538461539, "grad_norm": 19.090049743652344, "learning_rate": 1.4642857142857143e-07, "loss": 0.2765, "step": 477 }, { "epoch": 73.53846153846153, "grad_norm": 19.875932693481445, "learning_rate": 1.4523809523809524e-07, "loss": 0.2797, "step": 478 }, { "epoch": 73.6923076923077, "grad_norm": 21.46002960205078, "learning_rate": 1.4404761904761902e-07, "loss": 0.2846, "step": 479 }, { "epoch": 73.84615384615384, "grad_norm": 9.745357513427734, "learning_rate": 1.4285714285714285e-07, "loss": 0.3138, "step": 480 }, { "epoch": 74.0, "grad_norm": 9.963276863098145, "learning_rate": 1.4166666666666665e-07, "loss": 0.3364, "step": 481 }, { "epoch": 74.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.41662341356277466, "eval_runtime": 1.1761, "eval_samples_per_second": 232.123, "eval_steps_per_second": 4.251, "step": 481 }, { "epoch": 74.15384615384616, "grad_norm": 12.587307929992676, "learning_rate": 1.4047619047619048e-07, "loss": 0.3476, "step": 482 }, { "epoch": 74.3076923076923, "grad_norm": 15.544804573059082, "learning_rate": 1.392857142857143e-07, "loss": 0.298, "step": 483 }, { "epoch": 74.46153846153847, "grad_norm": 16.263813018798828, "learning_rate": 1.380952380952381e-07, "loss": 0.3103, "step": 484 }, { "epoch": 74.61538461538461, "grad_norm": 15.350561141967773, "learning_rate": 1.3690476190476192e-07, "loss": 0.3083, "step": 485 }, { "epoch": 74.76923076923077, "grad_norm": 18.922351837158203, "learning_rate": 1.357142857142857e-07, "loss": 0.2371, "step": 486 }, { "epoch": 74.92307692307692, "grad_norm": 10.000033378601074, "learning_rate": 1.345238095238095e-07, "loss": 0.283, "step": 487 }, { "epoch": 74.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4179239273071289, "eval_runtime": 1.1425, "eval_samples_per_second": 238.952, "eval_steps_per_second": 4.376, "step": 487 }, { "epoch": 75.07692307692308, "grad_norm": 11.999493598937988, "learning_rate": 1.3333333333333334e-07, "loss": 0.2356, "step": 488 }, { "epoch": 75.23076923076923, "grad_norm": 15.124945640563965, "learning_rate": 1.3214285714285714e-07, "loss": 0.2473, "step": 489 }, { "epoch": 75.38461538461539, "grad_norm": 5.885743618011475, "learning_rate": 1.3095238095238095e-07, "loss": 0.2623, "step": 490 }, { "epoch": 75.53846153846153, "grad_norm": 17.72136116027832, "learning_rate": 1.2976190476190478e-07, "loss": 0.2667, "step": 491 }, { "epoch": 75.6923076923077, "grad_norm": 17.649593353271484, "learning_rate": 1.2857142857142855e-07, "loss": 0.3016, "step": 492 }, { "epoch": 75.84615384615384, "grad_norm": 15.614336013793945, "learning_rate": 1.2738095238095236e-07, "loss": 0.2771, "step": 493 }, { "epoch": 76.0, "grad_norm": 6.446508884429932, "learning_rate": 1.261904761904762e-07, "loss": 0.2891, "step": 494 }, { "epoch": 76.0, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4174346923828125, "eval_runtime": 1.1254, "eval_samples_per_second": 242.591, "eval_steps_per_second": 4.443, "step": 494 }, { "epoch": 76.15384615384616, "grad_norm": 8.419719696044922, "learning_rate": 1.25e-07, "loss": 0.2639, "step": 495 }, { "epoch": 76.3076923076923, "grad_norm": 8.500795364379883, "learning_rate": 1.238095238095238e-07, "loss": 0.2902, "step": 496 }, { "epoch": 76.46153846153847, "grad_norm": 9.533052444458008, "learning_rate": 1.226190476190476e-07, "loss": 0.3342, "step": 497 }, { "epoch": 76.61538461538461, "grad_norm": 32.33898162841797, "learning_rate": 1.2142857142857143e-07, "loss": 0.3059, "step": 498 }, { "epoch": 76.76923076923077, "grad_norm": 14.258476257324219, "learning_rate": 1.2023809523809524e-07, "loss": 0.249, "step": 499 }, { "epoch": 76.92307692307692, "grad_norm": 28.383731842041016, "learning_rate": 1.1904761904761903e-07, "loss": 0.2625, "step": 500 }, { "epoch": 76.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4179657995700836, "eval_runtime": 1.0921, "eval_samples_per_second": 249.988, "eval_steps_per_second": 4.579, "step": 500 }, { "epoch": 77.07692307692308, "grad_norm": 14.453038215637207, "learning_rate": 1.1785714285714285e-07, "loss": 0.3501, "step": 501 }, { "epoch": 77.23076923076923, "grad_norm": 15.141488075256348, "learning_rate": 1.1666666666666667e-07, "loss": 0.2959, "step": 502 }, { "epoch": 77.38461538461539, "grad_norm": 27.595386505126953, "learning_rate": 1.1547619047619048e-07, "loss": 0.3883, "step": 503 }, { "epoch": 77.53846153846153, "grad_norm": 7.041699409484863, "learning_rate": 1.1428571428571427e-07, "loss": 0.2567, "step": 504 }, { "epoch": 77.6923076923077, "grad_norm": 5.878727912902832, "learning_rate": 1.1309523809523809e-07, "loss": 0.1855, "step": 505 }, { "epoch": 77.84615384615384, "grad_norm": 9.228104591369629, "learning_rate": 1.1190476190476191e-07, "loss": 0.2607, "step": 506 }, { "epoch": 78.0, "grad_norm": 10.124485969543457, "learning_rate": 1.107142857142857e-07, "loss": 0.2843, "step": 507 }, { "epoch": 78.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4184325635433197, "eval_runtime": 1.2364, "eval_samples_per_second": 220.81, "eval_steps_per_second": 4.044, "step": 507 }, { "epoch": 78.15384615384616, "grad_norm": 32.02544403076172, "learning_rate": 1.0952380952380952e-07, "loss": 0.2707, "step": 508 }, { "epoch": 78.3076923076923, "grad_norm": 8.309106826782227, "learning_rate": 1.0833333333333334e-07, "loss": 0.3501, "step": 509 }, { "epoch": 78.46153846153847, "grad_norm": 10.85938549041748, "learning_rate": 1.0714285714285713e-07, "loss": 0.3379, "step": 510 }, { "epoch": 78.61538461538461, "grad_norm": 14.523594856262207, "learning_rate": 1.0595238095238095e-07, "loss": 0.2191, "step": 511 }, { "epoch": 78.76923076923077, "grad_norm": 16.100353240966797, "learning_rate": 1.0476190476190476e-07, "loss": 0.3082, "step": 512 }, { "epoch": 78.92307692307692, "grad_norm": 10.894936561584473, "learning_rate": 1.0357142857142857e-07, "loss": 0.375, "step": 513 }, { "epoch": 78.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.41671693325042725, "eval_runtime": 1.116, "eval_samples_per_second": 244.622, "eval_steps_per_second": 4.48, "step": 513 }, { "epoch": 79.07692307692308, "grad_norm": 10.38732624053955, "learning_rate": 1.0238095238095237e-07, "loss": 0.2812, "step": 514 }, { "epoch": 79.23076923076923, "grad_norm": 15.733484268188477, "learning_rate": 1.0119047619047619e-07, "loss": 0.3247, "step": 515 }, { "epoch": 79.38461538461539, "grad_norm": 6.772809028625488, "learning_rate": 1e-07, "loss": 0.221, "step": 516 }, { "epoch": 79.53846153846153, "grad_norm": 8.480406761169434, "learning_rate": 9.88095238095238e-08, "loss": 0.2817, "step": 517 }, { "epoch": 79.6923076923077, "grad_norm": 8.911038398742676, "learning_rate": 9.761904761904762e-08, "loss": 0.3514, "step": 518 }, { "epoch": 79.84615384615384, "grad_norm": 18.952770233154297, "learning_rate": 9.642857142857142e-08, "loss": 0.2881, "step": 519 }, { "epoch": 80.0, "grad_norm": 6.310261249542236, "learning_rate": 9.523809523809523e-08, "loss": 0.3107, "step": 520 }, { "epoch": 80.0, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.41499239206314087, "eval_runtime": 1.191, "eval_samples_per_second": 229.225, "eval_steps_per_second": 4.198, "step": 520 }, { "epoch": 80.15384615384616, "grad_norm": 17.95909309387207, "learning_rate": 9.404761904761904e-08, "loss": 0.2551, "step": 521 }, { "epoch": 80.3076923076923, "grad_norm": 27.223407745361328, "learning_rate": 9.285714285714286e-08, "loss": 0.3356, "step": 522 }, { "epoch": 80.46153846153847, "grad_norm": 6.595218658447266, "learning_rate": 9.166666666666665e-08, "loss": 0.2383, "step": 523 }, { "epoch": 80.61538461538461, "grad_norm": 20.459001541137695, "learning_rate": 9.047619047619047e-08, "loss": 0.3265, "step": 524 }, { "epoch": 80.76923076923077, "grad_norm": 15.349759101867676, "learning_rate": 8.928571428571429e-08, "loss": 0.2763, "step": 525 }, { "epoch": 80.92307692307692, "grad_norm": 10.789344787597656, "learning_rate": 8.80952380952381e-08, "loss": 0.3742, "step": 526 }, { "epoch": 80.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4144986569881439, "eval_runtime": 1.1418, "eval_samples_per_second": 239.087, "eval_steps_per_second": 4.379, "step": 526 }, { "epoch": 81.07692307692308, "grad_norm": 13.075380325317383, "learning_rate": 8.69047619047619e-08, "loss": 0.2957, "step": 527 }, { "epoch": 81.23076923076923, "grad_norm": 10.450531959533691, "learning_rate": 8.571428571428572e-08, "loss": 0.3245, "step": 528 }, { "epoch": 81.38461538461539, "grad_norm": 20.416603088378906, "learning_rate": 8.452380952380952e-08, "loss": 0.2355, "step": 529 }, { "epoch": 81.53846153846153, "grad_norm": 11.954909324645996, "learning_rate": 8.333333333333333e-08, "loss": 0.2847, "step": 530 }, { "epoch": 81.6923076923077, "grad_norm": 10.024072647094727, "learning_rate": 8.214285714285714e-08, "loss": 0.3908, "step": 531 }, { "epoch": 81.84615384615384, "grad_norm": 13.764399528503418, "learning_rate": 8.095238095238095e-08, "loss": 0.2062, "step": 532 }, { "epoch": 82.0, "grad_norm": 18.268247604370117, "learning_rate": 7.976190476190475e-08, "loss": 0.2574, "step": 533 }, { "epoch": 82.0, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4145357608795166, "eval_runtime": 1.2255, "eval_samples_per_second": 222.772, "eval_steps_per_second": 4.08, "step": 533 }, { "epoch": 82.15384615384616, "grad_norm": 11.204299926757812, "learning_rate": 7.857142857142857e-08, "loss": 0.294, "step": 534 }, { "epoch": 82.3076923076923, "grad_norm": 12.602919578552246, "learning_rate": 7.738095238095237e-08, "loss": 0.3379, "step": 535 }, { "epoch": 82.46153846153847, "grad_norm": 17.001785278320312, "learning_rate": 7.619047619047619e-08, "loss": 0.2501, "step": 536 }, { "epoch": 82.61538461538461, "grad_norm": 10.472540855407715, "learning_rate": 7.5e-08, "loss": 0.2673, "step": 537 }, { "epoch": 82.76923076923077, "grad_norm": 12.93094539642334, "learning_rate": 7.38095238095238e-08, "loss": 0.3303, "step": 538 }, { "epoch": 82.92307692307692, "grad_norm": 15.572615623474121, "learning_rate": 7.261904761904762e-08, "loss": 0.329, "step": 539 }, { "epoch": 82.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.41488900780677795, "eval_runtime": 1.1463, "eval_samples_per_second": 238.152, "eval_steps_per_second": 4.362, "step": 539 }, { "epoch": 83.07692307692308, "grad_norm": 8.833599090576172, "learning_rate": 7.142857142857142e-08, "loss": 0.2862, "step": 540 }, { "epoch": 83.23076923076923, "grad_norm": 7.227090358734131, "learning_rate": 7.023809523809524e-08, "loss": 0.2553, "step": 541 }, { "epoch": 83.38461538461539, "grad_norm": 16.44085693359375, "learning_rate": 6.904761904761905e-08, "loss": 0.3129, "step": 542 }, { "epoch": 83.53846153846153, "grad_norm": 13.633960723876953, "learning_rate": 6.785714285714285e-08, "loss": 0.3134, "step": 543 }, { "epoch": 83.6923076923077, "grad_norm": 16.555570602416992, "learning_rate": 6.666666666666667e-08, "loss": 0.2504, "step": 544 }, { "epoch": 83.84615384615384, "grad_norm": 7.340324878692627, "learning_rate": 6.547619047619047e-08, "loss": 0.2966, "step": 545 }, { "epoch": 84.0, "grad_norm": 10.442778587341309, "learning_rate": 6.428571428571428e-08, "loss": 0.2727, "step": 546 }, { "epoch": 84.0, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.4145146608352661, "eval_runtime": 1.1279, "eval_samples_per_second": 242.042, "eval_steps_per_second": 4.433, "step": 546 }, { "epoch": 84.15384615384616, "grad_norm": 9.072919845581055, "learning_rate": 6.30952380952381e-08, "loss": 0.2461, "step": 547 }, { "epoch": 84.3076923076923, "grad_norm": 8.624760627746582, "learning_rate": 6.19047619047619e-08, "loss": 0.2812, "step": 548 }, { "epoch": 84.46153846153847, "grad_norm": 8.95349407196045, "learning_rate": 6.071428571428572e-08, "loss": 0.2835, "step": 549 }, { "epoch": 84.61538461538461, "grad_norm": 18.060441970825195, "learning_rate": 5.9523809523809515e-08, "loss": 0.2697, "step": 550 }, { "epoch": 84.76923076923077, "grad_norm": 15.820292472839355, "learning_rate": 5.833333333333333e-08, "loss": 0.3266, "step": 551 }, { "epoch": 84.92307692307692, "grad_norm": 10.503725051879883, "learning_rate": 5.714285714285714e-08, "loss": 0.2977, "step": 552 }, { "epoch": 84.92307692307692, "eval_accuracy": 0.8754578754578755, "eval_loss": 0.41494670510292053, "eval_runtime": 1.1373, "eval_samples_per_second": 240.052, "eval_steps_per_second": 4.397, "step": 552 }, { "epoch": 85.07692307692308, "grad_norm": 9.66882610321045, "learning_rate": 5.5952380952380955e-08, "loss": 0.3452, "step": 553 }, { "epoch": 85.23076923076923, "grad_norm": 9.360061645507812, "learning_rate": 5.476190476190476e-08, "loss": 0.3033, "step": 554 }, { "epoch": 85.38461538461539, "grad_norm": 10.878594398498535, "learning_rate": 5.3571428571428564e-08, "loss": 0.323, "step": 555 }, { "epoch": 85.53846153846153, "grad_norm": 9.788655281066895, "learning_rate": 5.238095238095238e-08, "loss": 0.2796, "step": 556 }, { "epoch": 85.6923076923077, "grad_norm": 11.249568939208984, "learning_rate": 5.1190476190476187e-08, "loss": 0.2381, "step": 557 }, { "epoch": 85.84615384615384, "grad_norm": 22.544105529785156, "learning_rate": 5e-08, "loss": 0.3593, "step": 558 }, { "epoch": 86.0, "grad_norm": 9.197429656982422, "learning_rate": 4.880952380952381e-08, "loss": 0.2611, "step": 559 }, { "epoch": 86.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4160268008708954, "eval_runtime": 1.1319, "eval_samples_per_second": 241.183, "eval_steps_per_second": 4.417, "step": 559 }, { "epoch": 86.15384615384616, "grad_norm": 10.480982780456543, "learning_rate": 4.7619047619047613e-08, "loss": 0.2919, "step": 560 }, { "epoch": 86.3076923076923, "grad_norm": 15.81564712524414, "learning_rate": 4.642857142857143e-08, "loss": 0.3395, "step": 561 }, { "epoch": 86.46153846153847, "grad_norm": 25.986629486083984, "learning_rate": 4.5238095238095236e-08, "loss": 0.3436, "step": 562 }, { "epoch": 86.61538461538461, "grad_norm": 20.077136993408203, "learning_rate": 4.404761904761905e-08, "loss": 0.363, "step": 563 }, { "epoch": 86.76923076923077, "grad_norm": 16.67424774169922, "learning_rate": 4.285714285714286e-08, "loss": 0.2731, "step": 564 }, { "epoch": 86.92307692307692, "grad_norm": 16.004140853881836, "learning_rate": 4.166666666666666e-08, "loss": 0.2542, "step": 565 }, { "epoch": 86.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4169901907444, "eval_runtime": 1.1368, "eval_samples_per_second": 240.144, "eval_steps_per_second": 4.398, "step": 565 }, { "epoch": 87.07692307692308, "grad_norm": 18.212156295776367, "learning_rate": 4.0476190476190474e-08, "loss": 0.3265, "step": 566 }, { "epoch": 87.23076923076923, "grad_norm": 12.494462966918945, "learning_rate": 3.9285714285714285e-08, "loss": 0.3226, "step": 567 }, { "epoch": 87.38461538461539, "grad_norm": 6.6637773513793945, "learning_rate": 3.8095238095238096e-08, "loss": 0.2262, "step": 568 }, { "epoch": 87.53846153846153, "grad_norm": 12.791160583496094, "learning_rate": 3.69047619047619e-08, "loss": 0.3541, "step": 569 }, { "epoch": 87.6923076923077, "grad_norm": 9.732351303100586, "learning_rate": 3.571428571428571e-08, "loss": 0.2551, "step": 570 }, { "epoch": 87.84615384615384, "grad_norm": 21.216760635375977, "learning_rate": 3.452380952380952e-08, "loss": 0.2629, "step": 571 }, { "epoch": 88.0, "grad_norm": 15.293636322021484, "learning_rate": 3.3333333333333334e-08, "loss": 0.2665, "step": 572 }, { "epoch": 88.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.41707876324653625, "eval_runtime": 1.1463, "eval_samples_per_second": 238.151, "eval_steps_per_second": 4.362, "step": 572 }, { "epoch": 88.15384615384616, "grad_norm": 11.668240547180176, "learning_rate": 3.214285714285714e-08, "loss": 0.3315, "step": 573 }, { "epoch": 88.3076923076923, "grad_norm": 28.14773178100586, "learning_rate": 3.095238095238095e-08, "loss": 0.3098, "step": 574 }, { "epoch": 88.46153846153847, "grad_norm": 13.815803527832031, "learning_rate": 2.9761904761904758e-08, "loss": 0.3381, "step": 575 }, { "epoch": 88.61538461538461, "grad_norm": 11.341737747192383, "learning_rate": 2.857142857142857e-08, "loss": 0.2995, "step": 576 }, { "epoch": 88.76923076923077, "grad_norm": 9.191329002380371, "learning_rate": 2.738095238095238e-08, "loss": 0.2485, "step": 577 }, { "epoch": 88.92307692307692, "grad_norm": 12.781599998474121, "learning_rate": 2.619047619047619e-08, "loss": 0.2654, "step": 578 }, { "epoch": 88.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4170469343662262, "eval_runtime": 1.1144, "eval_samples_per_second": 244.964, "eval_steps_per_second": 4.487, "step": 578 }, { "epoch": 89.07692307692308, "grad_norm": 18.813859939575195, "learning_rate": 2.5e-08, "loss": 0.4085, "step": 579 }, { "epoch": 89.23076923076923, "grad_norm": 9.056866645812988, "learning_rate": 2.3809523809523807e-08, "loss": 0.2377, "step": 580 }, { "epoch": 89.38461538461539, "grad_norm": 21.75194549560547, "learning_rate": 2.2619047619047618e-08, "loss": 0.2511, "step": 581 }, { "epoch": 89.53846153846153, "grad_norm": 24.87982177734375, "learning_rate": 2.142857142857143e-08, "loss": 0.3468, "step": 582 }, { "epoch": 89.6923076923077, "grad_norm": 13.90104866027832, "learning_rate": 2.0238095238095237e-08, "loss": 0.2001, "step": 583 }, { "epoch": 89.84615384615384, "grad_norm": 13.514042854309082, "learning_rate": 1.9047619047619048e-08, "loss": 0.2673, "step": 584 }, { "epoch": 90.0, "grad_norm": 17.871490478515625, "learning_rate": 1.7857142857142856e-08, "loss": 0.3059, "step": 585 }, { "epoch": 90.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4172230660915375, "eval_runtime": 1.1156, "eval_samples_per_second": 244.721, "eval_steps_per_second": 4.482, "step": 585 }, { "epoch": 90.15384615384616, "grad_norm": 9.703531265258789, "learning_rate": 1.6666666666666667e-08, "loss": 0.1731, "step": 586 }, { "epoch": 90.3076923076923, "grad_norm": 13.011953353881836, "learning_rate": 1.5476190476190475e-08, "loss": 0.3008, "step": 587 }, { "epoch": 90.46153846153847, "grad_norm": 13.331348419189453, "learning_rate": 1.4285714285714284e-08, "loss": 0.2402, "step": 588 }, { "epoch": 90.61538461538461, "grad_norm": 9.919706344604492, "learning_rate": 1.3095238095238096e-08, "loss": 0.3376, "step": 589 }, { "epoch": 90.76923076923077, "grad_norm": 20.184898376464844, "learning_rate": 1.1904761904761903e-08, "loss": 0.3106, "step": 590 }, { "epoch": 90.92307692307692, "grad_norm": 6.9222588539123535, "learning_rate": 1.0714285714285715e-08, "loss": 0.2377, "step": 591 }, { "epoch": 90.92307692307692, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.417271226644516, "eval_runtime": 1.1559, "eval_samples_per_second": 236.178, "eval_steps_per_second": 4.326, "step": 591 }, { "epoch": 91.07692307692308, "grad_norm": 10.644214630126953, "learning_rate": 9.523809523809524e-09, "loss": 0.2738, "step": 592 }, { "epoch": 91.23076923076923, "grad_norm": 12.530488014221191, "learning_rate": 8.333333333333334e-09, "loss": 0.3082, "step": 593 }, { "epoch": 91.38461538461539, "grad_norm": 10.068582534790039, "learning_rate": 7.142857142857142e-09, "loss": 0.2678, "step": 594 }, { "epoch": 91.53846153846153, "grad_norm": 8.5696439743042, "learning_rate": 5.952380952380952e-09, "loss": 0.2902, "step": 595 }, { "epoch": 91.6923076923077, "grad_norm": 22.662599563598633, "learning_rate": 4.761904761904762e-09, "loss": 0.2826, "step": 596 }, { "epoch": 91.84615384615384, "grad_norm": 19.666378021240234, "learning_rate": 3.571428571428571e-09, "loss": 0.3109, "step": 597 }, { "epoch": 92.0, "grad_norm": 15.434282302856445, "learning_rate": 2.380952380952381e-09, "loss": 0.2896, "step": 598 }, { "epoch": 92.0, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4172247350215912, "eval_runtime": 1.1283, "eval_samples_per_second": 241.962, "eval_steps_per_second": 4.432, "step": 598 }, { "epoch": 92.15384615384616, "grad_norm": 21.7716064453125, "learning_rate": 1.1904761904761905e-09, "loss": 0.3775, "step": 599 }, { "epoch": 92.3076923076923, "grad_norm": 9.635214805603027, "learning_rate": 0.0, "loss": 0.3133, "step": 600 }, { "epoch": 92.3076923076923, "eval_accuracy": 0.8717948717948718, "eval_loss": 0.4172203838825226, "eval_runtime": 1.1209, "eval_samples_per_second": 243.565, "eval_steps_per_second": 4.461, "step": 600 }, { "epoch": 92.3076923076923, "step": 600, "total_flos": 1.4722503891660472e+18, "train_loss": 0.3897706772387028, "train_runtime": 753.6703, "train_samples_per_second": 108.137, "train_steps_per_second": 0.796 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4722503891660472e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }