{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 814,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002457002457002457,
      "grad_norm": 4.916572570800781,
      "learning_rate": 2.4390243902439027e-06,
      "loss": 5.9819,
      "step": 2
    },
    {
      "epoch": 0.004914004914004914,
      "grad_norm": 4.77026891708374,
      "learning_rate": 4.8780487804878055e-06,
      "loss": 5.953,
      "step": 4
    },
    {
      "epoch": 0.007371007371007371,
      "grad_norm": 5.246749401092529,
      "learning_rate": 7.317073170731707e-06,
      "loss": 5.9223,
      "step": 6
    },
    {
      "epoch": 0.009828009828009828,
      "grad_norm": 5.039796829223633,
      "learning_rate": 9.756097560975611e-06,
      "loss": 5.5883,
      "step": 8
    },
    {
      "epoch": 0.012285012285012284,
      "grad_norm": 6.014425277709961,
      "learning_rate": 1.2195121951219513e-05,
      "loss": 5.539,
      "step": 10
    },
    {
      "epoch": 0.014742014742014743,
      "grad_norm": 10.090555191040039,
      "learning_rate": 1.4634146341463415e-05,
      "loss": 5.2428,
      "step": 12
    },
    {
      "epoch": 0.0171990171990172,
      "grad_norm": 6.888723850250244,
      "learning_rate": 1.707317073170732e-05,
      "loss": 5.239,
      "step": 14
    },
    {
      "epoch": 0.019656019656019656,
      "grad_norm": 4.933538436889648,
      "learning_rate": 1.9512195121951222e-05,
      "loss": 4.8712,
      "step": 16
    },
    {
      "epoch": 0.022113022113022112,
      "grad_norm": 2.7991392612457275,
      "learning_rate": 2.1951219512195124e-05,
      "loss": 4.4757,
      "step": 18
    },
    {
      "epoch": 0.02457002457002457,
      "grad_norm": 4.74180793762207,
      "learning_rate": 2.4390243902439026e-05,
      "loss": 4.8469,
      "step": 20
    },
    {
      "epoch": 0.02702702702702703,
      "grad_norm": 4.044187068939209,
      "learning_rate": 2.682926829268293e-05,
      "loss": 5.049,
      "step": 22
    },
    {
      "epoch": 0.029484029484029485,
      "grad_norm": 2.664706230163574,
      "learning_rate": 2.926829268292683e-05,
      "loss": 4.6624,
      "step": 24
    },
    {
      "epoch": 0.03194103194103194,
      "grad_norm": 3.159350633621216,
      "learning_rate": 3.170731707317073e-05,
      "loss": 4.7993,
      "step": 26
    },
    {
      "epoch": 0.0343980343980344,
      "grad_norm": 2.5463948249816895,
      "learning_rate": 3.414634146341464e-05,
      "loss": 4.4235,
      "step": 28
    },
    {
      "epoch": 0.036855036855036855,
      "grad_norm": 4.2436113357543945,
      "learning_rate": 3.6585365853658535e-05,
      "loss": 4.4116,
      "step": 30
    },
    {
      "epoch": 0.03931203931203931,
      "grad_norm": 2.924164295196533,
      "learning_rate": 3.9024390243902444e-05,
      "loss": 4.2755,
      "step": 32
    },
    {
      "epoch": 0.04176904176904177,
      "grad_norm": 2.3113248348236084,
      "learning_rate": 4.146341463414634e-05,
      "loss": 4.3122,
      "step": 34
    },
    {
      "epoch": 0.044226044226044224,
      "grad_norm": 2.386974573135376,
      "learning_rate": 4.390243902439025e-05,
      "loss": 4.3088,
      "step": 36
    },
    {
      "epoch": 0.04668304668304668,
      "grad_norm": 2.69736385345459,
      "learning_rate": 4.634146341463415e-05,
      "loss": 4.0982,
      "step": 38
    },
    {
      "epoch": 0.04914004914004914,
      "grad_norm": 3.7191948890686035,
      "learning_rate": 4.878048780487805e-05,
      "loss": 4.1512,
      "step": 40
    },
    {
      "epoch": 0.051597051597051594,
      "grad_norm": 2.4278435707092285,
      "learning_rate": 5.121951219512195e-05,
      "loss": 4.2134,
      "step": 42
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 3.0863969326019287,
      "learning_rate": 5.365853658536586e-05,
      "loss": 4.2307,
      "step": 44
    },
    {
      "epoch": 0.056511056511056514,
      "grad_norm": 3.4245433807373047,
      "learning_rate": 5.6097560975609764e-05,
      "loss": 4.0733,
      "step": 46
    },
    {
      "epoch": 0.05896805896805897,
      "grad_norm": 3.0049750804901123,
      "learning_rate": 5.853658536585366e-05,
      "loss": 4.0946,
      "step": 48
    },
    {
      "epoch": 0.06142506142506143,
      "grad_norm": 3.220999240875244,
      "learning_rate": 6.097560975609756e-05,
      "loss": 4.1286,
      "step": 50
    },
    {
      "epoch": 0.06388206388206388,
      "grad_norm": 2.5109856128692627,
      "learning_rate": 6.341463414634146e-05,
      "loss": 3.997,
      "step": 52
    },
    {
      "epoch": 0.06633906633906633,
      "grad_norm": 2.430593252182007,
      "learning_rate": 6.585365853658538e-05,
      "loss": 4.0292,
      "step": 54
    },
    {
      "epoch": 0.0687960687960688,
      "grad_norm": 2.1860780715942383,
      "learning_rate": 6.829268292682928e-05,
      "loss": 3.8444,
      "step": 56
    },
    {
      "epoch": 0.07125307125307126,
      "grad_norm": 2.2836203575134277,
      "learning_rate": 7.073170731707317e-05,
      "loss": 3.7986,
      "step": 58
    },
    {
      "epoch": 0.07371007371007371,
      "grad_norm": 2.4450736045837402,
      "learning_rate": 7.317073170731707e-05,
      "loss": 3.9541,
      "step": 60
    },
    {
      "epoch": 0.07616707616707617,
      "grad_norm": 2.628493309020996,
      "learning_rate": 7.560975609756099e-05,
      "loss": 3.8559,
      "step": 62
    },
    {
      "epoch": 0.07862407862407862,
      "grad_norm": 2.1850173473358154,
      "learning_rate": 7.804878048780489e-05,
      "loss": 3.8247,
      "step": 64
    },
    {
      "epoch": 0.08108108108108109,
      "grad_norm": 2.2945919036865234,
      "learning_rate": 8.048780487804879e-05,
      "loss": 3.915,
      "step": 66
    },
    {
      "epoch": 0.08353808353808354,
      "grad_norm": 2.381680727005005,
      "learning_rate": 8.292682926829268e-05,
      "loss": 3.7283,
      "step": 68
    },
    {
      "epoch": 0.085995085995086,
      "grad_norm": 2.36006498336792,
      "learning_rate": 8.53658536585366e-05,
      "loss": 3.6579,
      "step": 70
    },
    {
      "epoch": 0.08845208845208845,
      "grad_norm": 3.086056709289551,
      "learning_rate": 8.78048780487805e-05,
      "loss": 3.7464,
      "step": 72
    },
    {
      "epoch": 0.09090909090909091,
      "grad_norm": 3.201338529586792,
      "learning_rate": 9.02439024390244e-05,
      "loss": 3.8884,
      "step": 74
    },
    {
      "epoch": 0.09336609336609336,
      "grad_norm": 2.7978038787841797,
      "learning_rate": 9.26829268292683e-05,
      "loss": 3.5131,
      "step": 76
    },
    {
      "epoch": 0.09582309582309582,
      "grad_norm": 3.1305155754089355,
      "learning_rate": 9.51219512195122e-05,
      "loss": 3.6623,
      "step": 78
    },
    {
      "epoch": 0.09828009828009827,
      "grad_norm": 2.570423126220703,
      "learning_rate": 9.75609756097561e-05,
      "loss": 3.7053,
      "step": 80
    },
    {
      "epoch": 0.10073710073710074,
      "grad_norm": 2.9331843852996826,
      "learning_rate": 0.0001,
      "loss": 3.7604,
      "step": 82
    },
    {
      "epoch": 0.10319410319410319,
      "grad_norm": 2.7446630001068115,
      "learning_rate": 9.999958706645134e-05,
      "loss": 3.5296,
      "step": 84
    },
    {
      "epoch": 0.10565110565110565,
      "grad_norm": 3.145533323287964,
      "learning_rate": 9.999834827262588e-05,
      "loss": 3.6484,
      "step": 86
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 2.7964529991149902,
      "learning_rate": 9.999628363898526e-05,
      "loss": 3.6839,
      "step": 88
    },
    {
      "epoch": 0.11056511056511056,
      "grad_norm": 2.17167067527771,
      "learning_rate": 9.999339319963168e-05,
      "loss": 3.6529,
      "step": 90
    },
    {
      "epoch": 0.11302211302211303,
      "grad_norm": 2.126905679702759,
      "learning_rate": 9.998967700230757e-05,
      "loss": 3.7187,
      "step": 92
    },
    {
      "epoch": 0.11547911547911548,
      "grad_norm": 2.5139994621276855,
      "learning_rate": 9.998513510839458e-05,
      "loss": 3.6367,
      "step": 94
    },
    {
      "epoch": 0.11793611793611794,
      "grad_norm": 2.635418653488159,
      "learning_rate": 9.997976759291276e-05,
      "loss": 3.5057,
      "step": 96
    },
    {
      "epoch": 0.12039312039312039,
      "grad_norm": 2.3161840438842773,
      "learning_rate": 9.997357454451919e-05,
      "loss": 3.5836,
      "step": 98
    },
    {
      "epoch": 0.12285012285012285,
      "grad_norm": 2.4925875663757324,
      "learning_rate": 9.996655606550656e-05,
      "loss": 3.559,
      "step": 100
    },
    {
      "epoch": 0.12530712530712532,
      "grad_norm": 2.4260857105255127,
      "learning_rate": 9.99587122718015e-05,
      "loss": 3.6824,
      "step": 102
    },
    {
      "epoch": 0.12776412776412777,
      "grad_norm": 4.071969509124756,
      "learning_rate": 9.995004329296263e-05,
      "loss": 3.8088,
      "step": 104
    },
    {
      "epoch": 0.13022113022113022,
      "grad_norm": 2.3777356147766113,
      "learning_rate": 9.994054927217842e-05,
      "loss": 3.6324,
      "step": 106
    },
    {
      "epoch": 0.13267813267813267,
      "grad_norm": 2.1969950199127197,
      "learning_rate": 9.993023036626488e-05,
      "loss": 3.5438,
      "step": 108
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 2.2005531787872314,
      "learning_rate": 9.99190867456629e-05,
      "loss": 3.5784,
      "step": 110
    },
    {
      "epoch": 0.1375921375921376,
      "grad_norm": 2.205366611480713,
      "learning_rate": 9.990711859443546e-05,
      "loss": 3.4397,
      "step": 112
    },
    {
      "epoch": 0.14004914004914004,
      "grad_norm": 2.119697332382202,
      "learning_rate": 9.989432611026464e-05,
      "loss": 3.2832,
      "step": 114
    },
    {
      "epoch": 0.14250614250614252,
      "grad_norm": 2.2541282176971436,
      "learning_rate": 9.988070950444823e-05,
      "loss": 3.4448,
      "step": 116
    },
    {
      "epoch": 0.14496314496314497,
      "grad_norm": 2.172215461730957,
      "learning_rate": 9.986626900189641e-05,
      "loss": 3.4868,
      "step": 118
    },
    {
      "epoch": 0.14742014742014742,
      "grad_norm": 2.158733606338501,
      "learning_rate": 9.985100484112785e-05,
      "loss": 3.4516,
      "step": 120
    },
    {
      "epoch": 0.14987714987714987,
      "grad_norm": 2.2100753784179688,
      "learning_rate": 9.983491727426598e-05,
      "loss": 3.4269,
      "step": 122
    },
    {
      "epoch": 0.15233415233415235,
      "grad_norm": 2.00675106048584,
      "learning_rate": 9.981800656703457e-05,
      "loss": 3.5985,
      "step": 124
    },
    {
      "epoch": 0.1547911547911548,
      "grad_norm": 2.1808011531829834,
      "learning_rate": 9.980027299875358e-05,
      "loss": 3.4741,
      "step": 126
    },
    {
      "epoch": 0.15724815724815724,
      "grad_norm": 2.197129011154175,
      "learning_rate": 9.978171686233445e-05,
      "loss": 3.3629,
      "step": 128
    },
    {
      "epoch": 0.1597051597051597,
      "grad_norm": 2.001347780227661,
      "learning_rate": 9.97623384642752e-05,
      "loss": 3.4066,
      "step": 130
    },
    {
      "epoch": 0.16216216216216217,
      "grad_norm": 2.0654983520507812,
      "learning_rate": 9.974213812465547e-05,
      "loss": 3.473,
      "step": 132
    },
    {
      "epoch": 0.16461916461916462,
      "grad_norm": 2.565753698348999,
      "learning_rate": 9.972111617713116e-05,
      "loss": 3.3103,
      "step": 134
    },
    {
      "epoch": 0.16707616707616707,
      "grad_norm": 2.33060884475708,
      "learning_rate": 9.969927296892898e-05,
      "loss": 3.4601,
      "step": 136
    },
    {
      "epoch": 0.16953316953316952,
      "grad_norm": 2.2303621768951416,
      "learning_rate": 9.967660886084066e-05,
      "loss": 3.3306,
      "step": 138
    },
    {
      "epoch": 0.171990171990172,
      "grad_norm": 2.0361993312835693,
      "learning_rate": 9.965312422721704e-05,
      "loss": 3.2357,
      "step": 140
    },
    {
      "epoch": 0.17444717444717445,
      "grad_norm": 2.2737162113189697,
      "learning_rate": 9.962881945596184e-05,
      "loss": 3.3979,
      "step": 142
    },
    {
      "epoch": 0.1769041769041769,
      "grad_norm": 1.971992015838623,
      "learning_rate": 9.960369494852525e-05,
      "loss": 3.1709,
      "step": 144
    },
    {
      "epoch": 0.17936117936117937,
      "grad_norm": 2.1188580989837646,
      "learning_rate": 9.95777511198974e-05,
      "loss": 3.2497,
      "step": 146
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 2.1659576892852783,
      "learning_rate": 9.955098839860133e-05,
      "loss": 3.4369,
      "step": 148
    },
    {
      "epoch": 0.18427518427518427,
      "grad_norm": 2.2103657722473145,
      "learning_rate": 9.952340722668609e-05,
      "loss": 3.2721,
      "step": 150
    },
    {
      "epoch": 0.18673218673218672,
      "grad_norm": 2.0780789852142334,
      "learning_rate": 9.949500805971932e-05,
      "loss": 3.3651,
      "step": 152
    },
    {
      "epoch": 0.1891891891891892,
      "grad_norm": 2.086878538131714,
      "learning_rate": 9.946579136677978e-05,
      "loss": 3.3741,
      "step": 154
    },
    {
      "epoch": 0.19164619164619165,
      "grad_norm": 2.0054385662078857,
      "learning_rate": 9.943575763044955e-05,
      "loss": 3.1859,
      "step": 156
    },
    {
      "epoch": 0.1941031941031941,
      "grad_norm": 2.0755085945129395,
      "learning_rate": 9.940490734680614e-05,
      "loss": 3.2251,
      "step": 158
    },
    {
      "epoch": 0.19656019656019655,
      "grad_norm": 3.059140920639038,
      "learning_rate": 9.937324102541423e-05,
      "loss": 3.2752,
      "step": 160
    },
    {
      "epoch": 0.19901719901719903,
      "grad_norm": 2.0540404319763184,
      "learning_rate": 9.93407591893173e-05,
      "loss": 3.3965,
      "step": 162
    },
    {
      "epoch": 0.20147420147420148,
      "grad_norm": 2.07104754447937,
      "learning_rate": 9.930746237502892e-05,
      "loss": 3.2951,
      "step": 164
    },
    {
      "epoch": 0.20393120393120392,
      "grad_norm": 2.12972354888916,
      "learning_rate": 9.927335113252396e-05,
      "loss": 3.4232,
      "step": 166
    },
    {
      "epoch": 0.20638820638820637,
      "grad_norm": 2.369746685028076,
      "learning_rate": 9.923842602522949e-05,
      "loss": 3.2721,
      "step": 168
    },
    {
      "epoch": 0.20884520884520885,
      "grad_norm": 2.08709454536438,
      "learning_rate": 9.920268763001542e-05,
      "loss": 3.2015,
      "step": 170
    },
    {
      "epoch": 0.2113022113022113,
      "grad_norm": 2.0221400260925293,
      "learning_rate": 9.916613653718509e-05,
      "loss": 3.3356,
      "step": 172
    },
    {
      "epoch": 0.21375921375921375,
      "grad_norm": 2.353618860244751,
      "learning_rate": 9.912877335046535e-05,
      "loss": 3.2471,
      "step": 174
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 3.3538386821746826,
      "learning_rate": 9.909059868699678e-05,
      "loss": 3.1677,
      "step": 176
    },
    {
      "epoch": 0.21867321867321868,
      "grad_norm": 2.3216552734375,
      "learning_rate": 9.905161317732331e-05,
      "loss": 3.2548,
      "step": 178
    },
    {
      "epoch": 0.22113022113022113,
      "grad_norm": 2.058231830596924,
      "learning_rate": 9.901181746538196e-05,
      "loss": 3.1647,
      "step": 180
    },
    {
      "epoch": 0.22358722358722358,
      "grad_norm": 2.3197808265686035,
      "learning_rate": 9.897121220849208e-05,
      "loss": 3.1369,
      "step": 182
    },
    {
      "epoch": 0.22604422604422605,
      "grad_norm": 2.5758109092712402,
      "learning_rate": 9.892979807734462e-05,
      "loss": 3.206,
      "step": 184
    },
    {
      "epoch": 0.2285012285012285,
      "grad_norm": 2.34597110748291,
      "learning_rate": 9.888757575599093e-05,
      "loss": 3.2325,
      "step": 186
    },
    {
      "epoch": 0.23095823095823095,
      "grad_norm": 2.0011579990386963,
      "learning_rate": 9.884454594183154e-05,
      "loss": 3.2801,
      "step": 188
    },
    {
      "epoch": 0.2334152334152334,
      "grad_norm": 2.094426155090332,
      "learning_rate": 9.880070934560458e-05,
      "loss": 3.207,
      "step": 190
    },
    {
      "epoch": 0.23587223587223588,
      "grad_norm": 2.8441734313964844,
      "learning_rate": 9.875606669137412e-05,
      "loss": 3.2559,
      "step": 192
    },
    {
      "epoch": 0.23832923832923833,
      "grad_norm": 2.490898370742798,
      "learning_rate": 9.871061871651815e-05,
      "loss": 2.9974,
      "step": 194
    },
    {
      "epoch": 0.24078624078624078,
      "grad_norm": 2.0442817211151123,
      "learning_rate": 9.866436617171638e-05,
      "loss": 3.357,
      "step": 196
    },
    {
      "epoch": 0.24324324324324326,
      "grad_norm": 2.0015676021575928,
      "learning_rate": 9.861730982093793e-05,
      "loss": 3.2276,
      "step": 198
    },
    {
      "epoch": 0.2457002457002457,
      "grad_norm": 2.024724245071411,
      "learning_rate": 9.856945044142865e-05,
      "loss": 3.2181,
      "step": 200
    },
    {
      "epoch": 0.24815724815724816,
      "grad_norm": 2.0588181018829346,
      "learning_rate": 9.852078882369827e-05,
      "loss": 3.1506,
      "step": 202
    },
    {
      "epoch": 0.25061425061425063,
      "grad_norm": 2.1806745529174805,
      "learning_rate": 9.847132577150733e-05,
      "loss": 3.1492,
      "step": 204
    },
    {
      "epoch": 0.25307125307125306,
      "grad_norm": 2.226822853088379,
      "learning_rate": 9.842106210185403e-05,
      "loss": 3.1743,
      "step": 206
    },
    {
      "epoch": 0.25552825552825553,
      "grad_norm": 2.317154884338379,
      "learning_rate": 9.836999864496057e-05,
      "loss": 3.0891,
      "step": 208
    },
    {
      "epoch": 0.257985257985258,
      "grad_norm": 2.1667487621307373,
      "learning_rate": 9.831813624425952e-05,
      "loss": 3.1981,
      "step": 210
    },
    {
      "epoch": 0.26044226044226043,
      "grad_norm": 1.8015401363372803,
      "learning_rate": 9.82654757563799e-05,
      "loss": 2.9991,
      "step": 212
    },
    {
      "epoch": 0.2628992628992629,
      "grad_norm": 2.3263139724731445,
      "learning_rate": 9.821201805113298e-05,
      "loss": 3.0811,
      "step": 214
    },
    {
      "epoch": 0.26535626535626533,
      "grad_norm": 2.251429796218872,
      "learning_rate": 9.815776401149796e-05,
      "loss": 3.222,
      "step": 216
    },
    {
      "epoch": 0.2678132678132678,
      "grad_norm": 2.134190320968628,
      "learning_rate": 9.810271453360738e-05,
      "loss": 3.0341,
      "step": 218
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 1.9004825353622437,
      "learning_rate": 9.804687052673229e-05,
      "loss": 3.0711,
      "step": 220
    },
    {
      "epoch": 0.2727272727272727,
      "grad_norm": 2.2338693141937256,
      "learning_rate": 9.799023291326722e-05,
      "loss": 3.1774,
      "step": 222
    },
    {
      "epoch": 0.2751842751842752,
      "grad_norm": 2.2935733795166016,
      "learning_rate": 9.793280262871502e-05,
      "loss": 3.1498,
      "step": 224
    },
    {
      "epoch": 0.27764127764127766,
      "grad_norm": 2.144819736480713,
      "learning_rate": 9.787458062167134e-05,
      "loss": 3.0688,
      "step": 226
    },
    {
      "epoch": 0.2800982800982801,
      "grad_norm": 1.9869056940078735,
      "learning_rate": 9.781556785380899e-05,
      "loss": 3.3,
      "step": 228
    },
    {
      "epoch": 0.28255528255528256,
      "grad_norm": 1.8830708265304565,
      "learning_rate": 9.775576529986199e-05,
      "loss": 3.3633,
      "step": 230
    },
    {
      "epoch": 0.28501228501228504,
      "grad_norm": 2.0476791858673096,
      "learning_rate": 9.769517394760962e-05,
      "loss": 3.0215,
      "step": 232
    },
    {
      "epoch": 0.28746928746928746,
      "grad_norm": 2.008009433746338,
      "learning_rate": 9.763379479785995e-05,
      "loss": 3.1849,
      "step": 234
    },
    {
      "epoch": 0.28992628992628994,
      "grad_norm": 2.0959136486053467,
      "learning_rate": 9.757162886443336e-05,
      "loss": 3.1097,
      "step": 236
    },
    {
      "epoch": 0.29238329238329236,
      "grad_norm": 1.8712128400802612,
      "learning_rate": 9.750867717414586e-05,
      "loss": 3.1607,
      "step": 238
    },
    {
      "epoch": 0.29484029484029484,
      "grad_norm": 1.8421741724014282,
      "learning_rate": 9.744494076679205e-05,
      "loss": 3.1217,
      "step": 240
    },
    {
      "epoch": 0.2972972972972973,
      "grad_norm": 1.9873462915420532,
      "learning_rate": 9.738042069512795e-05,
      "loss": 3.2857,
      "step": 242
    },
    {
      "epoch": 0.29975429975429974,
      "grad_norm": 1.790082573890686,
      "learning_rate": 9.731511802485364e-05,
      "loss": 3.0234,
      "step": 244
    },
    {
      "epoch": 0.3022113022113022,
      "grad_norm": 1.915837049484253,
      "learning_rate": 9.724903383459566e-05,
      "loss": 3.1364,
      "step": 246
    },
    {
      "epoch": 0.3046683046683047,
      "grad_norm": 2.401880979537964,
      "learning_rate": 9.718216921588919e-05,
      "loss": 2.975,
      "step": 248
    },
    {
      "epoch": 0.3071253071253071,
      "grad_norm": 2.079129695892334,
      "learning_rate": 9.711452527315998e-05,
      "loss": 3.2686,
      "step": 250
    },
    {
      "epoch": 0.3095823095823096,
      "grad_norm": 1.9599655866622925,
      "learning_rate": 9.704610312370617e-05,
      "loss": 3.0804,
      "step": 252
    },
    {
      "epoch": 0.31203931203931207,
      "grad_norm": 2.091503858566284,
      "learning_rate": 9.697690389767981e-05,
      "loss": 3.1895,
      "step": 254
    },
    {
      "epoch": 0.3144963144963145,
      "grad_norm": 2.054234743118286,
      "learning_rate": 9.690692873806816e-05,
      "loss": 3.2141,
      "step": 256
    },
    {
      "epoch": 0.31695331695331697,
      "grad_norm": 2.081094264984131,
      "learning_rate": 9.683617880067489e-05,
      "loss": 3.2369,
      "step": 258
    },
    {
      "epoch": 0.3194103194103194,
      "grad_norm": 2.1534552574157715,
      "learning_rate": 9.676465525410088e-05,
      "loss": 3.1827,
      "step": 260
    },
    {
      "epoch": 0.32186732186732187,
      "grad_norm": 1.9752203226089478,
      "learning_rate": 9.669235927972502e-05,
      "loss": 3.2592,
      "step": 262
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 1.8343819379806519,
      "learning_rate": 9.661929207168463e-05,
      "loss": 2.9579,
      "step": 264
    },
    {
      "epoch": 0.32678132678132676,
      "grad_norm": 2.117063283920288,
      "learning_rate": 9.654545483685578e-05,
      "loss": 2.9743,
      "step": 266
    },
    {
      "epoch": 0.32923832923832924,
      "grad_norm": 1.9216965436935425,
      "learning_rate": 9.647084879483332e-05,
      "loss": 3.1944,
      "step": 268
    },
    {
      "epoch": 0.3316953316953317,
      "grad_norm": 2.0310311317443848,
      "learning_rate": 9.639547517791076e-05,
      "loss": 3.088,
      "step": 270
    },
    {
      "epoch": 0.33415233415233414,
      "grad_norm": 2.0127720832824707,
      "learning_rate": 9.631933523105991e-05,
      "loss": 3.0136,
      "step": 272
    },
    {
      "epoch": 0.3366093366093366,
      "grad_norm": 1.8773679733276367,
      "learning_rate": 9.624243021191029e-05,
      "loss": 3.0069,
      "step": 274
    },
    {
      "epoch": 0.33906633906633904,
      "grad_norm": 2.039201259613037,
      "learning_rate": 9.61647613907284e-05,
      "loss": 3.107,
      "step": 276
    },
    {
      "epoch": 0.3415233415233415,
      "grad_norm": 2.52468204498291,
      "learning_rate": 9.608633005039675e-05,
      "loss": 3.1008,
      "step": 278
    },
    {
      "epoch": 0.343980343980344,
      "grad_norm": 1.853696346282959,
      "learning_rate": 9.600713748639258e-05,
      "loss": 3.1372,
      "step": 280
    },
    {
      "epoch": 0.3464373464373464,
      "grad_norm": 2.019024133682251,
      "learning_rate": 9.592718500676656e-05,
      "loss": 3.0446,
      "step": 282
    },
    {
      "epoch": 0.3488943488943489,
      "grad_norm": 2.2153067588806152,
      "learning_rate": 9.584647393212113e-05,
      "loss": 3.0135,
      "step": 284
    },
    {
      "epoch": 0.35135135135135137,
      "grad_norm": 1.997916579246521,
      "learning_rate": 9.576500559558869e-05,
      "loss": 3.0678,
      "step": 286
    },
    {
      "epoch": 0.3538083538083538,
      "grad_norm": 1.8070591688156128,
      "learning_rate": 9.568278134280966e-05,
      "loss": 2.9913,
      "step": 288
    },
    {
      "epoch": 0.35626535626535627,
      "grad_norm": 1.8850058317184448,
      "learning_rate": 9.55998025319101e-05,
      "loss": 3.0786,
      "step": 290
    },
    {
      "epoch": 0.35872235872235875,
      "grad_norm": 1.8264799118041992,
      "learning_rate": 9.551607053347942e-05,
      "loss": 3.1469,
      "step": 292
    },
    {
      "epoch": 0.36117936117936117,
      "grad_norm": 1.955801010131836,
      "learning_rate": 9.543158673054767e-05,
      "loss": 2.9848,
      "step": 294
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 1.8326199054718018,
      "learning_rate": 9.534635251856267e-05,
      "loss": 2.7434,
      "step": 296
    },
    {
      "epoch": 0.36609336609336607,
      "grad_norm": 1.8823051452636719,
      "learning_rate": 9.526036930536712e-05,
      "loss": 2.9899,
      "step": 298
    },
    {
      "epoch": 0.36855036855036855,
      "grad_norm": 1.818613886833191,
      "learning_rate": 9.517363851117512e-05,
      "loss": 3.0477,
      "step": 300
    },
    {
      "epoch": 0.371007371007371,
      "grad_norm": 1.9045586585998535,
      "learning_rate": 9.508616156854883e-05,
      "loss": 3.1337,
      "step": 302
    },
    {
      "epoch": 0.37346437346437344,
      "grad_norm": 2.1247949600219727,
      "learning_rate": 9.499793992237485e-05,
      "loss": 2.8107,
      "step": 304
    },
    {
      "epoch": 0.3759213759213759,
      "grad_norm": 1.9648290872573853,
      "learning_rate": 9.490897502984028e-05,
      "loss": 2.9724,
      "step": 306
    },
    {
      "epoch": 0.3783783783783784,
      "grad_norm": 2.0654420852661133,
      "learning_rate": 9.481926836040866e-05,
      "loss": 2.9909,
      "step": 308
    },
    {
      "epoch": 0.3808353808353808,
      "grad_norm": 1.8634198904037476,
      "learning_rate": 9.472882139579572e-05,
      "loss": 3.1364,
      "step": 310
    },
    {
      "epoch": 0.3832923832923833,
      "grad_norm": 1.93793785572052,
      "learning_rate": 9.463763562994491e-05,
      "loss": 3.0364,
      "step": 312
    },
    {
      "epoch": 0.3857493857493858,
      "grad_norm": 2.107790231704712,
      "learning_rate": 9.454571256900272e-05,
      "loss": 2.904,
      "step": 314
    },
    {
      "epoch": 0.3882063882063882,
      "grad_norm": 1.9009639024734497,
      "learning_rate": 9.445305373129375e-05,
      "loss": 2.975,
      "step": 316
    },
    {
      "epoch": 0.3906633906633907,
      "grad_norm": 2.067288875579834,
      "learning_rate": 9.435966064729574e-05,
      "loss": 3.2711,
      "step": 318
    },
    {
      "epoch": 0.3931203931203931,
      "grad_norm": 2.018033981323242,
      "learning_rate": 9.426553485961415e-05,
      "loss": 3.1024,
      "step": 320
    },
    {
      "epoch": 0.3955773955773956,
      "grad_norm": 1.9113644361495972,
      "learning_rate": 9.417067792295684e-05,
      "loss": 2.9457,
      "step": 322
    },
    {
      "epoch": 0.39803439803439805,
      "grad_norm": 1.9023360013961792,
      "learning_rate": 9.407509140410826e-05,
      "loss": 2.9794,
      "step": 324
    },
    {
      "epoch": 0.4004914004914005,
      "grad_norm": 1.987796425819397,
      "learning_rate": 9.397877688190362e-05,
      "loss": 2.8287,
      "step": 326
    },
    {
      "epoch": 0.40294840294840295,
      "grad_norm": 1.9867451190948486,
      "learning_rate": 9.388173594720281e-05,
      "loss": 2.9206,
      "step": 328
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 2.01375150680542,
      "learning_rate": 9.378397020286417e-05,
      "loss": 3.0713,
      "step": 330
    },
    {
      "epoch": 0.40786240786240785,
      "grad_norm": 1.8884034156799316,
      "learning_rate": 9.368548126371788e-05,
      "loss": 3.0123,
      "step": 332
    },
    {
      "epoch": 0.4103194103194103,
      "grad_norm": 1.8966444730758667,
      "learning_rate": 9.358627075653946e-05,
      "loss": 2.9806,
      "step": 334
    },
    {
      "epoch": 0.41277641277641275,
      "grad_norm": 2.0271332263946533,
      "learning_rate": 9.348634032002277e-05,
      "loss": 3.0556,
      "step": 336
    },
    {
      "epoch": 0.4152334152334152,
      "grad_norm": 2.061434507369995,
      "learning_rate": 9.338569160475299e-05,
      "loss": 2.8075,
      "step": 338
    },
    {
      "epoch": 0.4176904176904177,
      "grad_norm": 1.9971458911895752,
      "learning_rate": 9.328432627317938e-05,
      "loss": 2.9535,
      "step": 340
    },
    {
      "epoch": 0.4201474201474201,
      "grad_norm": 1.8785593509674072,
      "learning_rate": 9.318224599958778e-05,
      "loss": 2.9911,
      "step": 342
    },
    {
      "epoch": 0.4226044226044226,
      "grad_norm": 1.8607487678527832,
      "learning_rate": 9.307945247007299e-05,
      "loss": 3.0419,
      "step": 344
    },
    {
      "epoch": 0.4250614250614251,
      "grad_norm": 1.9788838624954224,
      "learning_rate": 9.297594738251086e-05,
      "loss": 3.0239,
      "step": 346
    },
    {
      "epoch": 0.4275184275184275,
      "grad_norm": 1.7801693677902222,
      "learning_rate": 9.287173244653032e-05,
      "loss": 2.9975,
      "step": 348
    },
    {
      "epoch": 0.42997542997543,
      "grad_norm": 2.19821834564209,
      "learning_rate": 9.276680938348512e-05,
      "loss": 2.9937,
      "step": 350
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 2.122514247894287,
      "learning_rate": 9.266117992642536e-05,
      "loss": 3.0017,
      "step": 352
    },
    {
      "epoch": 0.4348894348894349,
      "grad_norm": 1.9420276880264282,
      "learning_rate": 9.25548458200689e-05,
      "loss": 2.9522,
      "step": 354
    },
    {
      "epoch": 0.43734643734643736,
      "grad_norm": 2.0469229221343994,
      "learning_rate": 9.244780882077254e-05,
      "loss": 2.7391,
      "step": 356
    },
    {
      "epoch": 0.4398034398034398,
      "grad_norm": 2.0379199981689453,
      "learning_rate": 9.2340070696503e-05,
      "loss": 3.0342,
      "step": 358
    },
    {
      "epoch": 0.44226044226044225,
      "grad_norm": 1.8155155181884766,
      "learning_rate": 9.223163322680772e-05,
      "loss": 2.9597,
      "step": 360
    },
    {
      "epoch": 0.44471744471744473,
      "grad_norm": 1.802756905555725,
      "learning_rate": 9.212249820278545e-05,
      "loss": 2.905,
      "step": 362
    },
    {
      "epoch": 0.44717444717444715,
      "grad_norm": 1.8715476989746094,
      "learning_rate": 9.201266742705672e-05,
      "loss": 3.1148,
      "step": 364
    },
    {
      "epoch": 0.44963144963144963,
      "grad_norm": 1.8322906494140625,
      "learning_rate": 9.190214271373398e-05,
      "loss": 2.8966,
      "step": 366
    },
    {
      "epoch": 0.4520884520884521,
      "grad_norm": 1.8648380041122437,
      "learning_rate": 9.179092588839178e-05,
      "loss": 2.7409,
      "step": 368
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 1.8744274377822876,
      "learning_rate": 9.167901878803638e-05,
      "loss": 2.8532,
      "step": 370
    },
    {
      "epoch": 0.457002457002457,
      "grad_norm": 1.9579302072525024,
      "learning_rate": 9.156642326107565e-05,
      "loss": 2.9217,
      "step": 372
    },
    {
      "epoch": 0.4594594594594595,
      "grad_norm": 1.7740904092788696,
      "learning_rate": 9.145314116728841e-05,
      "loss": 3.0257,
      "step": 374
    },
    {
      "epoch": 0.4619164619164619,
      "grad_norm": 1.762803316116333,
      "learning_rate": 9.133917437779375e-05,
      "loss": 2.9256,
      "step": 376
    },
    {
      "epoch": 0.4643734643734644,
      "grad_norm": 1.6586060523986816,
      "learning_rate": 9.12245247750201e-05,
      "loss": 2.9535,
      "step": 378
    },
    {
      "epoch": 0.4668304668304668,
      "grad_norm": 1.8642548322677612,
      "learning_rate": 9.110919425267415e-05,
      "loss": 3.0129,
      "step": 380
    },
    {
      "epoch": 0.4692874692874693,
      "grad_norm": 1.9764457941055298,
      "learning_rate": 9.099318471570957e-05,
      "loss": 2.9355,
      "step": 382
    },
    {
      "epoch": 0.47174447174447176,
      "grad_norm": 1.7947139739990234,
      "learning_rate": 9.087649808029554e-05,
      "loss": 2.8716,
      "step": 384
    },
    {
      "epoch": 0.4742014742014742,
      "grad_norm": 2.3146283626556396,
      "learning_rate": 9.075913627378513e-05,
      "loss": 2.9939,
      "step": 386
    },
    {
      "epoch": 0.47665847665847666,
      "grad_norm": 1.7969456911087036,
      "learning_rate": 9.064110123468345e-05,
      "loss": 2.997,
      "step": 388
    },
    {
      "epoch": 0.47911547911547914,
      "grad_norm": 1.8634977340698242,
      "learning_rate": 9.052239491261559e-05,
      "loss": 3.0475,
      "step": 390
    },
    {
      "epoch": 0.48157248157248156,
      "grad_norm": 1.8688544034957886,
      "learning_rate": 9.040301926829445e-05,
      "loss": 3.0927,
      "step": 392
    },
    {
      "epoch": 0.48402948402948404,
      "grad_norm": 1.8441022634506226,
      "learning_rate": 9.028297627348835e-05,
      "loss": 2.7731,
      "step": 394
    },
    {
      "epoch": 0.4864864864864865,
      "grad_norm": 1.8934268951416016,
      "learning_rate": 9.016226791098851e-05,
      "loss": 3.0989,
      "step": 396
    },
    {
      "epoch": 0.48894348894348894,
      "grad_norm": 1.7705714702606201,
      "learning_rate": 9.004089617457625e-05,
      "loss": 2.8965,
      "step": 398
    },
    {
      "epoch": 0.4914004914004914,
      "grad_norm": 1.8468502759933472,
      "learning_rate": 8.991886306899002e-05,
      "loss": 3.0472,
      "step": 400
    },
    {
      "epoch": 0.49385749385749383,
      "grad_norm": 1.8708940744400024,
      "learning_rate": 8.979617060989234e-05,
      "loss": 3.1242,
      "step": 402
    },
    {
      "epoch": 0.4963144963144963,
      "grad_norm": 1.8563363552093506,
      "learning_rate": 8.967282082383652e-05,
      "loss": 3.0993,
      "step": 404
    },
    {
      "epoch": 0.4987714987714988,
      "grad_norm": 1.8156354427337646,
      "learning_rate": 8.954881574823317e-05,
      "loss": 2.9281,
      "step": 406
    },
    {
      "epoch": 0.5012285012285013,
      "grad_norm": 1.8335216045379639,
      "learning_rate": 8.942415743131651e-05,
      "loss": 2.8471,
      "step": 408
    },
    {
      "epoch": 0.5036855036855037,
      "grad_norm": 1.8024260997772217,
      "learning_rate": 8.92988479321106e-05,
      "loss": 2.7557,
      "step": 410
    },
    {
      "epoch": 0.5061425061425061,
      "grad_norm": 1.8545125722885132,
      "learning_rate": 8.917288932039529e-05,
      "loss": 3.0465,
      "step": 412
    },
    {
      "epoch": 0.5085995085995086,
      "grad_norm": 1.9025720357894897,
      "learning_rate": 8.904628367667202e-05,
      "loss": 2.9257,
      "step": 414
    },
    {
      "epoch": 0.5110565110565111,
      "grad_norm": 1.7344011068344116,
      "learning_rate": 8.891903309212952e-05,
      "loss": 2.8647,
      "step": 416
    },
    {
      "epoch": 0.5135135135135135,
      "grad_norm": 2.9021973609924316,
      "learning_rate": 8.87911396686092e-05,
      "loss": 2.8066,
      "step": 418
    },
    {
      "epoch": 0.515970515970516,
      "grad_norm": 2.0105724334716797,
      "learning_rate": 8.866260551857045e-05,
      "loss": 3.0139,
      "step": 420
    },
    {
      "epoch": 0.5184275184275184,
      "grad_norm": 1.9714957475662231,
      "learning_rate": 8.853343276505581e-05,
      "loss": 2.8111,
      "step": 422
    },
    {
      "epoch": 0.5208845208845209,
      "grad_norm": 1.8330267667770386,
      "learning_rate": 8.840362354165581e-05,
      "loss": 2.8983,
      "step": 424
    },
    {
      "epoch": 0.5233415233415234,
      "grad_norm": 2.052891969680786,
      "learning_rate": 8.827317999247378e-05,
      "loss": 2.9208,
      "step": 426
    },
    {
      "epoch": 0.5257985257985258,
      "grad_norm": 1.8395956754684448,
      "learning_rate": 8.81421042720904e-05,
      "loss": 3.0045,
      "step": 428
    },
    {
      "epoch": 0.5282555282555282,
      "grad_norm": 2.1694936752319336,
      "learning_rate": 8.801039854552821e-05,
      "loss": 2.9298,
      "step": 430
    },
    {
      "epoch": 0.5307125307125307,
      "grad_norm": 1.9098365306854248,
      "learning_rate": 8.787806498821571e-05,
      "loss": 2.8694,
      "step": 432
    },
    {
      "epoch": 0.5331695331695332,
      "grad_norm": 1.8973677158355713,
      "learning_rate": 8.774510578595153e-05,
      "loss": 3.0031,
      "step": 434
    },
    {
      "epoch": 0.5356265356265356,
      "grad_norm": 1.8934143781661987,
      "learning_rate": 8.761152313486824e-05,
      "loss": 2.9812,
      "step": 436
    },
    {
      "epoch": 0.538083538083538,
      "grad_norm": 1.8013354539871216,
      "learning_rate": 8.747731924139622e-05,
      "loss": 2.7604,
      "step": 438
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 1.733801245689392,
      "learning_rate": 8.734249632222702e-05,
      "loss": 2.9479,
      "step": 440
    },
    {
      "epoch": 0.542997542997543,
      "grad_norm": 1.780144214630127,
      "learning_rate": 8.720705660427692e-05,
      "loss": 2.8673,
      "step": 442
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 1.930598258972168,
      "learning_rate": 8.707100232465007e-05,
      "loss": 2.85,
      "step": 444
    },
    {
      "epoch": 0.547911547911548,
      "grad_norm": 1.886904239654541,
      "learning_rate": 8.69343357306015e-05,
      "loss": 2.8363,
      "step": 446
    },
    {
      "epoch": 0.5503685503685504,
      "grad_norm": 2.0362017154693604,
      "learning_rate": 8.67970590795001e-05,
      "loss": 2.6025,
      "step": 448
    },
    {
      "epoch": 0.5528255528255528,
      "grad_norm": 1.8580693006515503,
      "learning_rate": 8.665917463879125e-05,
      "loss": 3.1713,
      "step": 450
    },
    {
      "epoch": 0.5552825552825553,
      "grad_norm": 1.9810400009155273,
      "learning_rate": 8.65206846859594e-05,
      "loss": 2.9814,
      "step": 452
    },
    {
      "epoch": 0.5577395577395577,
      "grad_norm": 1.824465274810791,
      "learning_rate": 8.638159150849046e-05,
      "loss": 3.0354,
      "step": 454
    },
    {
      "epoch": 0.5601965601965602,
      "grad_norm": 1.7572405338287354,
      "learning_rate": 8.6241897403834e-05,
      "loss": 2.9922,
      "step": 456
    },
    {
      "epoch": 0.5626535626535627,
      "grad_norm": 1.7362189292907715,
      "learning_rate": 8.610160467936533e-05,
      "loss": 2.9658,
      "step": 458
    },
    {
      "epoch": 0.5651105651105651,
      "grad_norm": 1.782990574836731,
      "learning_rate": 8.596071565234733e-05,
      "loss": 3.0351,
      "step": 460
    },
    {
      "epoch": 0.5675675675675675,
      "grad_norm": 1.6694259643554688,
      "learning_rate": 8.581923264989228e-05,
      "loss": 2.8801,
      "step": 462
    },
    {
      "epoch": 0.5700245700245701,
      "grad_norm": 1.7287397384643555,
      "learning_rate": 8.567715800892326e-05,
      "loss": 3.0199,
      "step": 464
    },
    {
      "epoch": 0.5724815724815725,
      "grad_norm": 1.8691298961639404,
      "learning_rate": 8.553449407613572e-05,
      "loss": 2.9934,
      "step": 466
    },
    {
      "epoch": 0.5749385749385749,
      "grad_norm": 1.8286222219467163,
      "learning_rate": 8.539124320795862e-05,
      "loss": 2.9836,
      "step": 468
    },
    {
      "epoch": 0.5773955773955773,
      "grad_norm": 1.658504843711853,
      "learning_rate": 8.524740777051555e-05,
      "loss": 2.7834,
      "step": 470
    },
    {
      "epoch": 0.5798525798525799,
      "grad_norm": 2.0098395347595215,
      "learning_rate": 8.510299013958558e-05,
      "loss": 3.0716,
      "step": 472
    },
    {
      "epoch": 0.5823095823095823,
      "grad_norm": 1.998702883720398,
      "learning_rate": 8.495799270056412e-05,
      "loss": 2.9943,
      "step": 474
    },
    {
      "epoch": 0.5847665847665847,
      "grad_norm": 1.7779194116592407,
      "learning_rate": 8.481241784842344e-05,
      "loss": 2.7784,
      "step": 476
    },
    {
      "epoch": 0.5872235872235873,
      "grad_norm": 1.8476285934448242,
      "learning_rate": 8.466626798767318e-05,
      "loss": 3.0224,
      "step": 478
    },
    {
      "epoch": 0.5896805896805897,
      "grad_norm": 1.759994626045227,
      "learning_rate": 8.451954553232055e-05,
      "loss": 3.1009,
      "step": 480
    },
    {
      "epoch": 0.5921375921375921,
      "grad_norm": 1.7729698419570923,
      "learning_rate": 8.437225290583051e-05,
      "loss": 2.7858,
      "step": 482
    },
    {
      "epoch": 0.5945945945945946,
      "grad_norm": 1.8636449575424194,
      "learning_rate": 8.422439254108576e-05,
      "loss": 2.974,
      "step": 484
    },
    {
      "epoch": 0.597051597051597,
      "grad_norm": 1.938599944114685,
      "learning_rate": 8.407596688034648e-05,
      "loss": 3.0187,
      "step": 486
    },
    {
      "epoch": 0.5995085995085995,
      "grad_norm": 1.7237671613693237,
      "learning_rate": 8.392697837521007e-05,
      "loss": 2.8565,
      "step": 488
    },
    {
      "epoch": 0.601965601965602,
      "grad_norm": 2.4623801708221436,
      "learning_rate": 8.37774294865706e-05,
      "loss": 2.7818,
      "step": 490
    },
    {
      "epoch": 0.6044226044226044,
      "grad_norm": 1.7916040420532227,
      "learning_rate": 8.362732268457824e-05,
      "loss": 3.1151,
      "step": 492
    },
    {
      "epoch": 0.6068796068796068,
      "grad_norm": 1.8052140474319458,
      "learning_rate": 8.347666044859833e-05,
      "loss": 2.8566,
      "step": 494
    },
    {
      "epoch": 0.6093366093366094,
      "grad_norm": 1.9497567415237427,
      "learning_rate": 8.332544526717057e-05,
      "loss": 2.8451,
      "step": 496
    },
    {
      "epoch": 0.6117936117936118,
      "grad_norm": 1.9246710538864136,
      "learning_rate": 8.317367963796778e-05,
      "loss": 2.9308,
      "step": 498
    },
    {
      "epoch": 0.6142506142506142,
      "grad_norm": 1.8030129671096802,
      "learning_rate": 8.30213660677548e-05,
      "loss": 2.8824,
      "step": 500
    },
    {
      "epoch": 0.6167076167076168,
      "grad_norm": 1.8339178562164307,
      "learning_rate": 8.286850707234691e-05,
      "loss": 2.8304,
      "step": 502
    },
    {
      "epoch": 0.6191646191646192,
      "grad_norm": 2.070434808731079,
      "learning_rate": 8.271510517656845e-05,
      "loss": 2.8802,
      "step": 504
    },
    {
      "epoch": 0.6216216216216216,
      "grad_norm": 2.7643368244171143,
      "learning_rate": 8.256116291421094e-05,
      "loss": 3.0449,
      "step": 506
    },
    {
      "epoch": 0.6240786240786241,
      "grad_norm": 2.0422732830047607,
      "learning_rate": 8.24066828279914e-05,
      "loss": 2.9059,
      "step": 508
    },
    {
      "epoch": 0.6265356265356266,
      "grad_norm": 2.012234687805176,
      "learning_rate": 8.225166746951023e-05,
      "loss": 3.0151,
      "step": 510
    },
    {
      "epoch": 0.628992628992629,
      "grad_norm": 1.7563395500183105,
      "learning_rate": 8.209611939920912e-05,
      "loss": 2.6551,
      "step": 512
    },
    {
      "epoch": 0.6314496314496314,
      "grad_norm": 1.9222311973571777,
      "learning_rate": 8.194004118632873e-05,
      "loss": 2.8514,
      "step": 514
    },
    {
      "epoch": 0.6339066339066339,
      "grad_norm": 1.8835155963897705,
      "learning_rate": 8.178343540886626e-05,
      "loss": 2.883,
      "step": 516
    },
    {
      "epoch": 0.6363636363636364,
      "grad_norm": 1.8356646299362183,
      "learning_rate": 8.162630465353292e-05,
      "loss": 2.7415,
      "step": 518
    },
    {
      "epoch": 0.6388206388206388,
      "grad_norm": 1.890042781829834,
      "learning_rate": 8.146865151571108e-05,
      "loss": 2.82,
      "step": 520
    },
    {
      "epoch": 0.6412776412776413,
      "grad_norm": 1.9901831150054932,
      "learning_rate": 8.131047859941156e-05,
      "loss": 2.7657,
      "step": 522
    },
    {
      "epoch": 0.6437346437346437,
      "grad_norm": 1.8286992311477661,
      "learning_rate": 8.11517885172305e-05,
      "loss": 2.9325,
      "step": 524
    },
    {
      "epoch": 0.6461916461916462,
      "grad_norm": 1.817006230354309,
      "learning_rate": 8.099258389030624e-05,
      "loss": 2.6395,
      "step": 526
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 1.7543143033981323,
      "learning_rate": 8.083286734827605e-05,
      "loss": 2.847,
      "step": 528
    },
    {
      "epoch": 0.6511056511056511,
      "grad_norm": 1.8252453804016113,
      "learning_rate": 8.067264152923268e-05,
      "loss": 2.8588,
      "step": 530
    },
    {
      "epoch": 0.6535626535626535,
      "grad_norm": 1.9331204891204834,
      "learning_rate": 8.051190907968076e-05,
      "loss": 2.7571,
      "step": 532
    },
    {
      "epoch": 0.6560196560196561,
      "grad_norm": 1.7939069271087646,
      "learning_rate": 8.035067265449312e-05,
      "loss": 2.7728,
      "step": 534
    },
    {
      "epoch": 0.6584766584766585,
      "grad_norm": 4.82619571685791,
      "learning_rate": 8.018893491686692e-05,
      "loss": 2.891,
      "step": 536
    },
    {
      "epoch": 0.6609336609336609,
      "grad_norm": 1.9427580833435059,
      "learning_rate": 8.00266985382797e-05,
      "loss": 2.9047,
      "step": 538
    },
    {
      "epoch": 0.6633906633906634,
      "grad_norm": 1.8458834886550903,
      "learning_rate": 7.986396619844519e-05,
      "loss": 2.8492,
      "step": 540
    },
    {
      "epoch": 0.6658476658476659,
      "grad_norm": 1.9331218004226685,
      "learning_rate": 7.970074058526908e-05,
      "loss": 2.8547,
      "step": 542
    },
    {
      "epoch": 0.6683046683046683,
      "grad_norm": 1.8768330812454224,
      "learning_rate": 7.953702439480468e-05,
      "loss": 2.575,
      "step": 544
    },
    {
      "epoch": 0.6707616707616708,
      "grad_norm": 1.9068561792373657,
      "learning_rate": 7.937282033120825e-05,
      "loss": 3.0085,
      "step": 546
    },
    {
      "epoch": 0.6732186732186732,
      "grad_norm": 1.890088438987732,
      "learning_rate": 7.920813110669445e-05,
      "loss": 2.9341,
      "step": 548
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 1.8849834203720093,
      "learning_rate": 7.904295944149157e-05,
      "loss": 2.7635,
      "step": 550
    },
    {
      "epoch": 0.6781326781326781,
      "grad_norm": 1.7430170774459839,
      "learning_rate": 7.887730806379641e-05,
      "loss": 2.608,
      "step": 552
    },
    {
      "epoch": 0.6805896805896806,
      "grad_norm": 1.7541913986206055,
      "learning_rate": 7.871117970972948e-05,
      "loss": 3.0008,
      "step": 554
    },
    {
      "epoch": 0.683046683046683,
      "grad_norm": 1.8100526332855225,
      "learning_rate": 7.854457712328957e-05,
      "loss": 2.8934,
      "step": 556
    },
    {
      "epoch": 0.6855036855036855,
      "grad_norm": 1.8675240278244019,
      "learning_rate": 7.837750305630862e-05,
      "loss": 2.7809,
      "step": 558
    },
    {
      "epoch": 0.687960687960688,
      "grad_norm": 2.3414931297302246,
      "learning_rate": 7.820996026840607e-05,
      "loss": 2.8065,
      "step": 560
    },
    {
      "epoch": 0.6904176904176904,
      "grad_norm": 1.6343646049499512,
      "learning_rate": 7.804195152694347e-05,
      "loss": 2.9003,
      "step": 562
    },
    {
      "epoch": 0.6928746928746928,
      "grad_norm": 1.7469531297683716,
      "learning_rate": 7.787347960697863e-05,
      "loss": 2.8534,
      "step": 564
    },
    {
      "epoch": 0.6953316953316954,
      "grad_norm": 1.7567309141159058,
      "learning_rate": 7.77045472912199e-05,
      "loss": 2.7476,
      "step": 566
    },
    {
      "epoch": 0.6977886977886978,
      "grad_norm": 1.7307237386703491,
      "learning_rate": 7.753515736998007e-05,
      "loss": 2.7558,
      "step": 568
    },
    {
      "epoch": 0.7002457002457002,
      "grad_norm": 1.689818263053894,
      "learning_rate": 7.736531264113041e-05,
      "loss": 2.5798,
      "step": 570
    },
    {
      "epoch": 0.7027027027027027,
      "grad_norm": 1.806868553161621,
      "learning_rate": 7.719501591005436e-05,
      "loss": 2.9242,
      "step": 572
    },
    {
      "epoch": 0.7051597051597052,
      "grad_norm": 1.8649382591247559,
      "learning_rate": 7.702426998960129e-05,
      "loss": 2.7499,
      "step": 574
    },
    {
      "epoch": 0.7076167076167076,
      "grad_norm": 1.8912419080734253,
      "learning_rate": 7.685307770003993e-05,
      "loss": 2.9558,
      "step": 576
    },
    {
      "epoch": 0.7100737100737101,
      "grad_norm": 1.8979637622833252,
      "learning_rate": 7.668144186901189e-05,
      "loss": 2.7551,
      "step": 578
    },
    {
      "epoch": 0.7125307125307125,
      "grad_norm": 1.737945556640625,
      "learning_rate": 7.650936533148485e-05,
      "loss": 2.9113,
      "step": 580
    },
    {
      "epoch": 0.714987714987715,
      "grad_norm": 1.8241580724716187,
      "learning_rate": 7.633685092970584e-05,
      "loss": 2.9001,
      "step": 582
    },
    {
      "epoch": 0.7174447174447175,
      "grad_norm": 1.7341822385787964,
      "learning_rate": 7.616390151315422e-05,
      "loss": 2.7347,
      "step": 584
    },
    {
      "epoch": 0.7199017199017199,
      "grad_norm": 1.8221603631973267,
      "learning_rate": 7.599051993849467e-05,
      "loss": 2.501,
      "step": 586
    },
    {
      "epoch": 0.7223587223587223,
      "grad_norm": 1.9198024272918701,
      "learning_rate": 7.58167090695299e-05,
      "loss": 2.9385,
      "step": 588
    },
    {
      "epoch": 0.7248157248157249,
      "grad_norm": 1.725482702255249,
      "learning_rate": 7.56424717771535e-05,
      "loss": 2.7557,
      "step": 590
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 2.0752015113830566,
      "learning_rate": 7.546781093930238e-05,
      "loss": 2.6973,
      "step": 592
    },
    {
      "epoch": 0.7297297297297297,
      "grad_norm": 1.84364914894104,
      "learning_rate": 7.529272944090935e-05,
      "loss": 2.8651,
      "step": 594
    },
    {
      "epoch": 0.7321867321867321,
      "grad_norm": 2.004852771759033,
      "learning_rate": 7.511723017385538e-05,
      "loss": 2.6205,
      "step": 596
    },
    {
      "epoch": 0.7346437346437347,
      "grad_norm": 1.847954273223877,
      "learning_rate": 7.494131603692187e-05,
      "loss": 2.745,
      "step": 598
    },
    {
      "epoch": 0.7371007371007371,
      "grad_norm": 1.865380048751831,
      "learning_rate": 7.476498993574277e-05,
      "loss": 2.8809,
      "step": 600
    },
    {
      "epoch": 0.7395577395577395,
      "grad_norm": 1.8404914140701294,
      "learning_rate": 7.45882547827566e-05,
      "loss": 2.8683,
      "step": 602
    },
    {
      "epoch": 0.742014742014742,
      "grad_norm": 1.783154010772705,
      "learning_rate": 7.441111349715832e-05,
      "loss": 2.7588,
      "step": 604
    },
    {
      "epoch": 0.7444717444717445,
      "grad_norm": 1.7274466753005981,
      "learning_rate": 7.423356900485108e-05,
      "loss": 2.8013,
      "step": 606
    },
    {
      "epoch": 0.7469287469287469,
      "grad_norm": 1.9206289052963257,
      "learning_rate": 7.405562423839801e-05,
      "loss": 2.7951,
      "step": 608
    },
    {
      "epoch": 0.7493857493857494,
      "grad_norm": 1.9059563875198364,
      "learning_rate": 7.387728213697365e-05,
      "loss": 2.756,
      "step": 610
    },
    {
      "epoch": 0.7518427518427518,
      "grad_norm": 3.3889310359954834,
      "learning_rate": 7.369854564631548e-05,
      "loss": 2.9342,
      "step": 612
    },
    {
      "epoch": 0.7542997542997543,
      "grad_norm": 1.733508825302124,
      "learning_rate": 7.351941771867523e-05,
      "loss": 2.823,
      "step": 614
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 1.679160475730896,
      "learning_rate": 7.333990131277013e-05,
      "loss": 2.7185,
      "step": 616
    },
    {
      "epoch": 0.7592137592137592,
      "grad_norm": 3.083275318145752,
      "learning_rate": 7.315999939373404e-05,
      "loss": 2.6941,
      "step": 618
    },
    {
      "epoch": 0.7616707616707616,
      "grad_norm": 1.7113125324249268,
      "learning_rate": 7.297971493306848e-05,
      "loss": 2.9749,
      "step": 620
    },
    {
      "epoch": 0.7641277641277642,
      "grad_norm": 1.654995083808899,
      "learning_rate": 7.279905090859352e-05,
      "loss": 2.6857,
      "step": 622
    },
    {
      "epoch": 0.7665847665847666,
      "grad_norm": 2.0876009464263916,
      "learning_rate": 7.261801030439864e-05,
      "loss": 2.6532,
      "step": 624
    },
    {
      "epoch": 0.769041769041769,
      "grad_norm": 1.7576290369033813,
      "learning_rate": 7.243659611079343e-05,
      "loss": 2.6503,
      "step": 626
    },
    {
      "epoch": 0.7714987714987716,
      "grad_norm": 1.8806345462799072,
      "learning_rate": 7.225481132425812e-05,
      "loss": 2.5581,
      "step": 628
    },
    {
      "epoch": 0.773955773955774,
      "grad_norm": 1.7295315265655518,
      "learning_rate": 7.20726589473942e-05,
      "loss": 2.7995,
      "step": 630
    },
    {
      "epoch": 0.7764127764127764,
      "grad_norm": 1.7444125413894653,
      "learning_rate": 7.189014198887478e-05,
      "loss": 2.9175,
      "step": 632
    },
    {
      "epoch": 0.7788697788697788,
      "grad_norm": 1.9849998950958252,
      "learning_rate": 7.170726346339488e-05,
      "loss": 2.9118,
      "step": 634
    },
    {
      "epoch": 0.7813267813267813,
      "grad_norm": 1.7984490394592285,
      "learning_rate": 7.15240263916216e-05,
      "loss": 2.7955,
      "step": 636
    },
    {
      "epoch": 0.7837837837837838,
      "grad_norm": 1.6878135204315186,
      "learning_rate": 7.134043380014436e-05,
      "loss": 2.6611,
      "step": 638
    },
    {
      "epoch": 0.7862407862407862,
      "grad_norm": 1.6732217073440552,
      "learning_rate": 7.115648872142475e-05,
      "loss": 2.5905,
      "step": 640
    },
    {
      "epoch": 0.7886977886977887,
      "grad_norm": 1.8399916887283325,
      "learning_rate": 7.097219419374652e-05,
      "loss": 2.8581,
      "step": 642
    },
    {
      "epoch": 0.7911547911547911,
      "grad_norm": 1.6764298677444458,
      "learning_rate": 7.078755326116542e-05,
      "loss": 2.5396,
      "step": 644
    },
    {
      "epoch": 0.7936117936117936,
      "grad_norm": 1.882920742034912,
      "learning_rate": 7.060256897345888e-05,
      "loss": 2.9048,
      "step": 646
    },
    {
      "epoch": 0.7960687960687961,
      "grad_norm": 1.9298219680786133,
      "learning_rate": 7.041724438607563e-05,
      "loss": 2.954,
      "step": 648
    },
    {
      "epoch": 0.7985257985257985,
      "grad_norm": 1.710889458656311,
      "learning_rate": 7.023158256008521e-05,
      "loss": 2.9693,
      "step": 650
    },
    {
      "epoch": 0.800982800982801,
      "grad_norm": 1.7387322187423706,
      "learning_rate": 7.004558656212753e-05,
      "loss": 2.8788,
      "step": 652
    },
    {
      "epoch": 0.8034398034398035,
      "grad_norm": 1.8131159543991089,
      "learning_rate": 6.985925946436213e-05,
      "loss": 2.8383,
      "step": 654
    },
    {
      "epoch": 0.8058968058968059,
      "grad_norm": 1.7700482606887817,
      "learning_rate": 6.967260434441729e-05,
      "loss": 2.7677,
      "step": 656
    },
    {
      "epoch": 0.8083538083538083,
      "grad_norm": 1.7338896989822388,
      "learning_rate": 6.948562428533955e-05,
      "loss": 2.8565,
      "step": 658
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 1.7637079954147339,
      "learning_rate": 6.929832237554241e-05,
      "loss": 2.8684,
      "step": 660
    },
    {
      "epoch": 0.8132678132678133,
      "grad_norm": 1.7479512691497803,
      "learning_rate": 6.911070170875562e-05,
      "loss": 2.8784,
      "step": 662
    },
    {
      "epoch": 0.8157248157248157,
      "grad_norm": 2.2720673084259033,
      "learning_rate": 6.892276538397384e-05,
      "loss": 2.7932,
      "step": 664
    },
    {
      "epoch": 0.8181818181818182,
      "grad_norm": 1.7335164546966553,
      "learning_rate": 6.873451650540566e-05,
      "loss": 2.6538,
      "step": 666
    },
    {
      "epoch": 0.8206388206388207,
      "grad_norm": 1.8123350143432617,
      "learning_rate": 6.854595818242213e-05,
      "loss": 2.8612,
      "step": 668
    },
    {
      "epoch": 0.8230958230958231,
      "grad_norm": 1.6376591920852661,
      "learning_rate": 6.835709352950557e-05,
      "loss": 2.8271,
      "step": 670
    },
    {
      "epoch": 0.8255528255528255,
      "grad_norm": 1.863923192024231,
      "learning_rate": 6.816792566619806e-05,
      "loss": 2.6843,
      "step": 672
    },
    {
      "epoch": 0.828009828009828,
      "grad_norm": 1.637650728225708,
      "learning_rate": 6.797845771704983e-05,
      "loss": 2.7099,
      "step": 674
    },
    {
      "epoch": 0.8304668304668305,
      "grad_norm": 1.7621389627456665,
      "learning_rate": 6.778869281156784e-05,
      "loss": 2.8904,
      "step": 676
    },
    {
      "epoch": 0.8329238329238329,
      "grad_norm": 1.6948051452636719,
      "learning_rate": 6.759863408416386e-05,
      "loss": 2.7876,
      "step": 678
    },
    {
      "epoch": 0.8353808353808354,
      "grad_norm": 1.742684006690979,
      "learning_rate": 6.740828467410294e-05,
      "loss": 2.9384,
      "step": 680
    },
    {
      "epoch": 0.8378378378378378,
      "grad_norm": 1.7781345844268799,
      "learning_rate": 6.721764772545135e-05,
      "loss": 2.5972,
      "step": 682
    },
    {
      "epoch": 0.8402948402948403,
      "grad_norm": 1.7659155130386353,
      "learning_rate": 6.702672638702475e-05,
      "loss": 2.8173,
      "step": 684
    },
    {
      "epoch": 0.8427518427518428,
      "grad_norm": 1.7547752857208252,
      "learning_rate": 6.68355238123362e-05,
      "loss": 2.8518,
      "step": 686
    },
    {
      "epoch": 0.8452088452088452,
      "grad_norm": 1.634647250175476,
      "learning_rate": 6.664404315954397e-05,
      "loss": 2.7208,
      "step": 688
    },
    {
      "epoch": 0.8476658476658476,
      "grad_norm": 1.6761173009872437,
      "learning_rate": 6.64522875913995e-05,
      "loss": 2.812,
      "step": 690
    },
    {
      "epoch": 0.8501228501228502,
      "grad_norm": 1.7231212854385376,
      "learning_rate": 6.626026027519509e-05,
      "loss": 2.7787,
      "step": 692
    },
    {
      "epoch": 0.8525798525798526,
      "grad_norm": 1.8298566341400146,
      "learning_rate": 6.606796438271156e-05,
      "loss": 2.755,
      "step": 694
    },
    {
      "epoch": 0.855036855036855,
      "grad_norm": 1.7021121978759766,
      "learning_rate": 6.587540309016592e-05,
      "loss": 2.5943,
      "step": 696
    },
    {
      "epoch": 0.8574938574938575,
      "grad_norm": 1.7043418884277344,
      "learning_rate": 6.568257957815893e-05,
      "loss": 2.7609,
      "step": 698
    },
    {
      "epoch": 0.85995085995086,
      "grad_norm": 1.7982568740844727,
      "learning_rate": 6.54894970316224e-05,
      "loss": 2.5406,
      "step": 700
    },
    {
      "epoch": 0.8624078624078624,
      "grad_norm": 1.8539308309555054,
      "learning_rate": 6.529615863976684e-05,
      "loss": 2.8071,
      "step": 702
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 1.7513319253921509,
      "learning_rate": 6.510256759602857e-05,
      "loss": 2.7662,
      "step": 704
    },
    {
      "epoch": 0.8673218673218673,
      "grad_norm": 1.7730543613433838,
      "learning_rate": 6.4908727098017e-05,
      "loss": 2.8082,
      "step": 706
    },
    {
      "epoch": 0.8697788697788698,
      "grad_norm": 1.6426489353179932,
      "learning_rate": 6.4714640347462e-05,
      "loss": 2.5879,
      "step": 708
    },
    {
      "epoch": 0.8722358722358723,
      "grad_norm": 1.6702338457107544,
      "learning_rate": 6.452031055016073e-05,
      "loss": 2.6247,
      "step": 710
    },
    {
      "epoch": 0.8746928746928747,
      "grad_norm": 1.6281359195709229,
      "learning_rate": 6.432574091592494e-05,
      "loss": 2.6192,
      "step": 712
    },
    {
      "epoch": 0.8771498771498771,
      "grad_norm": 1.7311663627624512,
      "learning_rate": 6.41309346585278e-05,
      "loss": 2.916,
      "step": 714
    },
    {
      "epoch": 0.8796068796068796,
      "grad_norm": 1.781010627746582,
      "learning_rate": 6.393589499565088e-05,
      "loss": 2.6185,
      "step": 716
    },
    {
      "epoch": 0.8820638820638821,
      "grad_norm": 1.6897130012512207,
      "learning_rate": 6.374062514883099e-05,
      "loss": 2.8202,
      "step": 718
    },
    {
      "epoch": 0.8845208845208845,
      "grad_norm": 1.6871286630630493,
      "learning_rate": 6.354512834340695e-05,
      "loss": 2.6197,
      "step": 720
    },
    {
      "epoch": 0.8869778869778869,
      "grad_norm": 1.7017685174942017,
      "learning_rate": 6.334940780846634e-05,
      "loss": 2.6785,
      "step": 722
    },
    {
      "epoch": 0.8894348894348895,
      "grad_norm": 1.6969901323318481,
      "learning_rate": 6.315346677679218e-05,
      "loss": 2.8976,
      "step": 724
    },
    {
      "epoch": 0.8918918918918919,
      "grad_norm": 1.8819012641906738,
      "learning_rate": 6.295730848480947e-05,
      "loss": 2.7231,
      "step": 726
    },
    {
      "epoch": 0.8943488943488943,
      "grad_norm": 1.8459581136703491,
      "learning_rate": 6.276093617253182e-05,
      "loss": 2.7186,
      "step": 728
    },
    {
      "epoch": 0.8968058968058968,
      "grad_norm": 1.7170779705047607,
      "learning_rate": 6.256435308350786e-05,
      "loss": 2.6779,
      "step": 730
    },
    {
      "epoch": 0.8992628992628993,
      "grad_norm": 1.7795841693878174,
      "learning_rate": 6.236756246476765e-05,
      "loss": 2.8444,
      "step": 732
    },
    {
      "epoch": 0.9017199017199017,
      "grad_norm": 1.7305338382720947,
      "learning_rate": 6.217056756676917e-05,
      "loss": 2.5758,
      "step": 734
    },
    {
      "epoch": 0.9041769041769042,
      "grad_norm": 1.757850170135498,
      "learning_rate": 6.197337164334453e-05,
      "loss": 2.8906,
      "step": 736
    },
    {
      "epoch": 0.9066339066339066,
      "grad_norm": 1.7602792978286743,
      "learning_rate": 6.177597795164616e-05,
      "loss": 2.6525,
      "step": 738
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 1.6600172519683838,
      "learning_rate": 6.157838975209323e-05,
      "loss": 2.5314,
      "step": 740
    },
|
{ |
|
"epoch": 0.9115479115479116, |
|
"grad_norm": 1.699733018875122, |
|
"learning_rate": 6.138061030831755e-05, |
|
"loss": 2.8172, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.914004914004914, |
|
"grad_norm": 1.6197196245193481, |
|
"learning_rate": 6.118264288710988e-05, |
|
"loss": 2.7426, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.9164619164619164, |
|
"grad_norm": 1.6376876831054688, |
|
"learning_rate": 6.098449075836575e-05, |
|
"loss": 2.7575, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"grad_norm": 1.6010876893997192, |
|
"learning_rate": 6.0786157195031653e-05, |
|
"loss": 2.6253, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.9213759213759214, |
|
"grad_norm": 1.6927965879440308, |
|
"learning_rate": 6.058764547305088e-05, |
|
"loss": 2.8049, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9238329238329238, |
|
"grad_norm": 1.682666540145874, |
|
"learning_rate": 6.038895887130942e-05, |
|
"loss": 2.8196, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.9262899262899262, |
|
"grad_norm": 1.8771963119506836, |
|
"learning_rate": 6.019010067158181e-05, |
|
"loss": 2.8902, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.9287469287469288, |
|
"grad_norm": 1.6778903007507324, |
|
"learning_rate": 5.9991074158476935e-05, |
|
"loss": 2.7642, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.9312039312039312, |
|
"grad_norm": 2.0854263305664062, |
|
"learning_rate": 5.9791882619383766e-05, |
|
"loss": 2.727, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.9336609336609336, |
|
"grad_norm": 1.6467037200927734, |
|
"learning_rate": 5.959252934441707e-05, |
|
"loss": 2.5711, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9361179361179361, |
|
"grad_norm": 1.6560814380645752, |
|
"learning_rate": 5.939301762636307e-05, |
|
"loss": 2.8803, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.9385749385749386, |
|
"grad_norm": 1.6084171533584595, |
|
"learning_rate": 5.9193350760625014e-05, |
|
"loss": 2.6728, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.941031941031941, |
|
"grad_norm": 1.7624053955078125, |
|
"learning_rate": 5.8993532045168795e-05, |
|
"loss": 2.621, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.9434889434889435, |
|
"grad_norm": 1.6158403158187866, |
|
"learning_rate": 5.879356478046849e-05, |
|
"loss": 2.7241, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 1.5942398309707642, |
|
"learning_rate": 5.8593452269451775e-05, |
|
"loss": 2.6864, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9484029484029484, |
|
"grad_norm": 1.6023515462875366, |
|
"learning_rate": 5.839319781744542e-05, |
|
"loss": 2.8454, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.9508599508599509, |
|
"grad_norm": 1.6624382734298706, |
|
"learning_rate": 5.81928047321207e-05, |
|
"loss": 2.6242, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.9533169533169533, |
|
"grad_norm": 1.638469934463501, |
|
"learning_rate": 5.79922763234387e-05, |
|
"loss": 2.599, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.9557739557739557, |
|
"grad_norm": 1.7188334465026855, |
|
"learning_rate": 5.779161590359573e-05, |
|
"loss": 2.7079, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.9582309582309583, |
|
"grad_norm": 1.6725521087646484, |
|
"learning_rate": 5.7590826786968576e-05, |
|
"loss": 2.6391, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9606879606879607, |
|
"grad_norm": 1.6070250272750854, |
|
"learning_rate": 5.738991229005972e-05, |
|
"loss": 2.5252, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.9631449631449631, |
|
"grad_norm": 1.8139675855636597, |
|
"learning_rate": 5.7188875731442605e-05, |
|
"loss": 2.545, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.9656019656019657, |
|
"grad_norm": 1.6884874105453491, |
|
"learning_rate": 5.6987720431706826e-05, |
|
"loss": 2.5958, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.9680589680589681, |
|
"grad_norm": 1.735592246055603, |
|
"learning_rate": 5.678644971340326e-05, |
|
"loss": 2.9243, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.9705159705159705, |
|
"grad_norm": 1.7505390644073486, |
|
"learning_rate": 5.658506690098916e-05, |
|
"loss": 2.782, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"grad_norm": 1.6411902904510498, |
|
"learning_rate": 5.638357532077331e-05, |
|
"loss": 2.5277, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.9754299754299754, |
|
"grad_norm": 1.745453119277954, |
|
"learning_rate": 5.6181978300861046e-05, |
|
"loss": 2.4979, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.9778869778869779, |
|
"grad_norm": 1.6847970485687256, |
|
"learning_rate": 5.598027917109929e-05, |
|
"loss": 2.6112, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.9803439803439803, |
|
"grad_norm": 1.7334604263305664, |
|
"learning_rate": 5.577848126302152e-05, |
|
"loss": 2.8999, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.9828009828009828, |
|
"grad_norm": 1.6627264022827148, |
|
"learning_rate": 5.55765879097928e-05, |
|
"loss": 2.7808, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9852579852579852, |
|
"grad_norm": 1.8199021816253662, |
|
"learning_rate": 5.5374602446154665e-05, |
|
"loss": 2.846, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.9877149877149877, |
|
"grad_norm": 1.6511715650558472, |
|
"learning_rate": 5.517252820837011e-05, |
|
"loss": 2.518, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.9901719901719902, |
|
"grad_norm": 1.7161328792572021, |
|
"learning_rate": 5.49703685341684e-05, |
|
"loss": 2.7736, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.9926289926289926, |
|
"grad_norm": 1.740677833557129, |
|
"learning_rate": 5.4768126762690034e-05, |
|
"loss": 2.6334, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.995085995085995, |
|
"grad_norm": 1.5712820291519165, |
|
"learning_rate": 5.456580623443145e-05, |
|
"loss": 2.6854, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9975429975429976, |
|
"grad_norm": 1.6212587356567383, |
|
"learning_rate": 5.436341029119004e-05, |
|
"loss": 2.7081, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.0476884841918945, |
|
"learning_rate": 5.416094227600881e-05, |
|
"loss": 3.0235, |
|
"step": 814 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1628, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 814, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3763952104177664e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|