|
{ |
|
"best_metric": 0.4462641775608063, |
|
"best_model_checkpoint": "cbb-3b/checkpoint-366", |
|
"epoch": 0.9993174061433447, |
|
"eval_steps": 500, |
|
"global_step": 366, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027303754266211604, |
|
"grad_norm": 0.7549694776535034, |
|
"learning_rate": 1.360544217687075e-06, |
|
"loss": 1.2225, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005460750853242321, |
|
"grad_norm": 0.7538214325904846, |
|
"learning_rate": 2.72108843537415e-06, |
|
"loss": 1.2103, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.008191126279863481, |
|
"grad_norm": 0.7328954935073853, |
|
"learning_rate": 4.081632653061224e-06, |
|
"loss": 1.1858, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010921501706484642, |
|
"grad_norm": 0.7359272837638855, |
|
"learning_rate": 5.4421768707483e-06, |
|
"loss": 1.1885, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013651877133105802, |
|
"grad_norm": 0.740386426448822, |
|
"learning_rate": 6.802721088435375e-06, |
|
"loss": 1.1781, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016382252559726963, |
|
"grad_norm": 0.6984951496124268, |
|
"learning_rate": 8.163265306122448e-06, |
|
"loss": 1.1395, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01911262798634812, |
|
"grad_norm": 0.6689624786376953, |
|
"learning_rate": 9.523809523809523e-06, |
|
"loss": 1.137, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.021843003412969283, |
|
"grad_norm": 0.6134174466133118, |
|
"learning_rate": 1.08843537414966e-05, |
|
"loss": 1.1531, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.024573378839590442, |
|
"grad_norm": 0.5647606253623962, |
|
"learning_rate": 1.2244897959183674e-05, |
|
"loss": 1.1201, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.027303754266211604, |
|
"grad_norm": 0.541833221912384, |
|
"learning_rate": 1.360544217687075e-05, |
|
"loss": 1.0989, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030034129692832763, |
|
"grad_norm": 0.4785626232624054, |
|
"learning_rate": 1.4965986394557824e-05, |
|
"loss": 1.0664, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.032764505119453925, |
|
"grad_norm": 0.42421552538871765, |
|
"learning_rate": 1.6326530612244897e-05, |
|
"loss": 1.057, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03549488054607509, |
|
"grad_norm": 0.384870707988739, |
|
"learning_rate": 1.7687074829931973e-05, |
|
"loss": 0.9794, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03822525597269624, |
|
"grad_norm": 0.31449463963508606, |
|
"learning_rate": 1.9047619047619046e-05, |
|
"loss": 0.9485, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.040955631399317405, |
|
"grad_norm": 0.29094135761260986, |
|
"learning_rate": 2.0408163265306123e-05, |
|
"loss": 0.9581, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.04368600682593857, |
|
"grad_norm": 0.2500893771648407, |
|
"learning_rate": 2.17687074829932e-05, |
|
"loss": 0.9363, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04641638225255973, |
|
"grad_norm": 0.2445881962776184, |
|
"learning_rate": 2.3129251700680275e-05, |
|
"loss": 0.9186, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.049146757679180884, |
|
"grad_norm": 0.2477860301733017, |
|
"learning_rate": 2.448979591836735e-05, |
|
"loss": 0.9099, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05187713310580205, |
|
"grad_norm": 0.24853268265724182, |
|
"learning_rate": 2.5850340136054425e-05, |
|
"loss": 0.912, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05460750853242321, |
|
"grad_norm": 0.22501873970031738, |
|
"learning_rate": 2.72108843537415e-05, |
|
"loss": 0.8836, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05733788395904437, |
|
"grad_norm": 0.21223071217536926, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.8651, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.060068259385665526, |
|
"grad_norm": 0.20172430574893951, |
|
"learning_rate": 2.9931972789115647e-05, |
|
"loss": 0.8393, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06279863481228669, |
|
"grad_norm": 0.17902718484401703, |
|
"learning_rate": 3.1292517006802724e-05, |
|
"loss": 0.8033, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06552901023890785, |
|
"grad_norm": 0.1813097447156906, |
|
"learning_rate": 3.265306122448979e-05, |
|
"loss": 0.8152, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06825938566552901, |
|
"grad_norm": 0.19280143082141876, |
|
"learning_rate": 3.401360544217687e-05, |
|
"loss": 0.8051, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07098976109215017, |
|
"grad_norm": 0.17157189548015594, |
|
"learning_rate": 3.5374149659863946e-05, |
|
"loss": 0.794, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07372013651877134, |
|
"grad_norm": 0.1467738002538681, |
|
"learning_rate": 3.673469387755102e-05, |
|
"loss": 0.7874, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07645051194539249, |
|
"grad_norm": 0.13913457095623016, |
|
"learning_rate": 3.809523809523809e-05, |
|
"loss": 0.7519, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07918088737201365, |
|
"grad_norm": 0.13179022073745728, |
|
"learning_rate": 3.945578231292517e-05, |
|
"loss": 0.76, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.08191126279863481, |
|
"grad_norm": 0.1376553773880005, |
|
"learning_rate": 4.0816326530612245e-05, |
|
"loss": 0.7369, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08464163822525597, |
|
"grad_norm": 0.14040575921535492, |
|
"learning_rate": 4.217687074829932e-05, |
|
"loss": 0.7463, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08737201365187713, |
|
"grad_norm": 0.13217338919639587, |
|
"learning_rate": 4.35374149659864e-05, |
|
"loss": 0.7298, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0901023890784983, |
|
"grad_norm": 0.11285194754600525, |
|
"learning_rate": 4.4897959183673474e-05, |
|
"loss": 0.7134, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09283276450511946, |
|
"grad_norm": 0.10098642110824585, |
|
"learning_rate": 4.625850340136055e-05, |
|
"loss": 0.7238, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09556313993174062, |
|
"grad_norm": 0.10341370850801468, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.6908, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09829351535836177, |
|
"grad_norm": 0.09662918746471405, |
|
"learning_rate": 4.89795918367347e-05, |
|
"loss": 0.7, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10102389078498293, |
|
"grad_norm": 0.09548471122980118, |
|
"learning_rate": 5.034013605442177e-05, |
|
"loss": 0.7207, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1037542662116041, |
|
"grad_norm": 0.09512269496917725, |
|
"learning_rate": 5.170068027210885e-05, |
|
"loss": 0.7016, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10648464163822526, |
|
"grad_norm": 0.0912129282951355, |
|
"learning_rate": 5.3061224489795926e-05, |
|
"loss": 0.6891, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10921501706484642, |
|
"grad_norm": 0.08661182224750519, |
|
"learning_rate": 5.4421768707483e-05, |
|
"loss": 0.6982, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11194539249146758, |
|
"grad_norm": 0.09124922007322311, |
|
"learning_rate": 5.5782312925170065e-05, |
|
"loss": 0.7051, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11467576791808874, |
|
"grad_norm": 0.09174500405788422, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.6978, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1174061433447099, |
|
"grad_norm": 0.0679943636059761, |
|
"learning_rate": 5.850340136054422e-05, |
|
"loss": 0.6889, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12013651877133105, |
|
"grad_norm": 0.07204238325357437, |
|
"learning_rate": 5.9863945578231295e-05, |
|
"loss": 0.704, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12286689419795221, |
|
"grad_norm": 0.08089234679937363, |
|
"learning_rate": 6.122448979591838e-05, |
|
"loss": 0.6838, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12559726962457338, |
|
"grad_norm": 0.09053023904561996, |
|
"learning_rate": 6.258503401360545e-05, |
|
"loss": 0.6754, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12832764505119454, |
|
"grad_norm": 0.07513958215713501, |
|
"learning_rate": 6.394557823129253e-05, |
|
"loss": 0.6894, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1310580204778157, |
|
"grad_norm": 0.07480401545763016, |
|
"learning_rate": 6.530612244897959e-05, |
|
"loss": 0.6809, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13378839590443686, |
|
"grad_norm": 0.07617643475532532, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.697, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13651877133105803, |
|
"grad_norm": 0.06744271516799927, |
|
"learning_rate": 6.802721088435374e-05, |
|
"loss": 0.6921, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1392491467576792, |
|
"grad_norm": 0.07185206562280655, |
|
"learning_rate": 6.938775510204082e-05, |
|
"loss": 0.6536, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.14197952218430035, |
|
"grad_norm": 0.07255382090806961, |
|
"learning_rate": 7.074829931972789e-05, |
|
"loss": 0.653, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1447098976109215, |
|
"grad_norm": 0.07474930584430695, |
|
"learning_rate": 7.210884353741498e-05, |
|
"loss": 0.6888, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14744027303754267, |
|
"grad_norm": 0.0754467323422432, |
|
"learning_rate": 7.346938775510205e-05, |
|
"loss": 0.6818, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15017064846416384, |
|
"grad_norm": 0.07726683467626572, |
|
"learning_rate": 7.482993197278913e-05, |
|
"loss": 0.6835, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15290102389078497, |
|
"grad_norm": 0.07462974637746811, |
|
"learning_rate": 7.619047619047618e-05, |
|
"loss": 0.667, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15563139931740613, |
|
"grad_norm": 0.06939647346735, |
|
"learning_rate": 7.755102040816327e-05, |
|
"loss": 0.6668, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1583617747440273, |
|
"grad_norm": 0.08218149840831757, |
|
"learning_rate": 7.891156462585034e-05, |
|
"loss": 0.6762, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.16109215017064846, |
|
"grad_norm": 0.0838819146156311, |
|
"learning_rate": 8.027210884353742e-05, |
|
"loss": 0.6685, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.16382252559726962, |
|
"grad_norm": 0.07441603392362595, |
|
"learning_rate": 8.163265306122449e-05, |
|
"loss": 0.6573, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16655290102389078, |
|
"grad_norm": 0.0746053010225296, |
|
"learning_rate": 8.299319727891157e-05, |
|
"loss": 0.6582, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16928327645051194, |
|
"grad_norm": 0.08602144569158554, |
|
"learning_rate": 8.435374149659864e-05, |
|
"loss": 0.6547, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1720136518771331, |
|
"grad_norm": 0.08236663043498993, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.6081, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.17474402730375427, |
|
"grad_norm": 0.08744888752698898, |
|
"learning_rate": 8.70748299319728e-05, |
|
"loss": 0.6576, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17747440273037543, |
|
"grad_norm": 0.08321461081504822, |
|
"learning_rate": 8.843537414965987e-05, |
|
"loss": 0.6137, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1802047781569966, |
|
"grad_norm": 0.08639347553253174, |
|
"learning_rate": 8.979591836734695e-05, |
|
"loss": 0.6579, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18293515358361775, |
|
"grad_norm": 0.09154847264289856, |
|
"learning_rate": 9.115646258503402e-05, |
|
"loss": 0.6391, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18566552901023892, |
|
"grad_norm": 0.1094379723072052, |
|
"learning_rate": 9.25170068027211e-05, |
|
"loss": 0.61, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18839590443686008, |
|
"grad_norm": 0.11089900881052017, |
|
"learning_rate": 9.387755102040817e-05, |
|
"loss": 0.6452, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.19112627986348124, |
|
"grad_norm": 0.11615785956382751, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 0.6463, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19385665529010238, |
|
"grad_norm": 0.08359086513519287, |
|
"learning_rate": 9.659863945578231e-05, |
|
"loss": 0.6364, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.19658703071672354, |
|
"grad_norm": 0.0885363295674324, |
|
"learning_rate": 9.79591836734694e-05, |
|
"loss": 0.6092, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1993174061433447, |
|
"grad_norm": 0.09258115291595459, |
|
"learning_rate": 9.931972789115646e-05, |
|
"loss": 0.6229, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.20204778156996586, |
|
"grad_norm": 0.08969170600175858, |
|
"learning_rate": 0.00010068027210884355, |
|
"loss": 0.6173, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.20477815699658702, |
|
"grad_norm": 0.10124260932207108, |
|
"learning_rate": 0.00010204081632653062, |
|
"loss": 0.6414, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2075085324232082, |
|
"grad_norm": 0.08671349287033081, |
|
"learning_rate": 0.0001034013605442177, |
|
"loss": 0.6145, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.21023890784982935, |
|
"grad_norm": 0.09684890508651733, |
|
"learning_rate": 0.00010476190476190477, |
|
"loss": 0.6262, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2129692832764505, |
|
"grad_norm": 0.08690830320119858, |
|
"learning_rate": 0.00010612244897959185, |
|
"loss": 0.6316, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.21569965870307167, |
|
"grad_norm": 0.10457205027341843, |
|
"learning_rate": 0.00010748299319727892, |
|
"loss": 0.639, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.21843003412969283, |
|
"grad_norm": 0.10080841183662415, |
|
"learning_rate": 0.000108843537414966, |
|
"loss": 0.592, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.221160409556314, |
|
"grad_norm": 0.08858262002468109, |
|
"learning_rate": 0.00011020408163265306, |
|
"loss": 0.6471, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.22389078498293516, |
|
"grad_norm": 0.08708172291517258, |
|
"learning_rate": 0.00011156462585034013, |
|
"loss": 0.6222, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.22662116040955632, |
|
"grad_norm": 0.1075206995010376, |
|
"learning_rate": 0.00011292517006802721, |
|
"loss": 0.5961, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.22935153583617748, |
|
"grad_norm": 0.11788732558488846, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.609, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.23208191126279865, |
|
"grad_norm": 0.0956830084323883, |
|
"learning_rate": 0.00011564625850340137, |
|
"loss": 0.6042, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2348122866894198, |
|
"grad_norm": 0.09799174964427948, |
|
"learning_rate": 0.00011700680272108844, |
|
"loss": 0.6045, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.23754266211604094, |
|
"grad_norm": 0.09177012741565704, |
|
"learning_rate": 0.00011836734693877552, |
|
"loss": 0.6068, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2402730375426621, |
|
"grad_norm": 0.10407502949237823, |
|
"learning_rate": 0.00011972789115646259, |
|
"loss": 0.5993, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.24300341296928327, |
|
"grad_norm": 0.1047271341085434, |
|
"learning_rate": 0.00012108843537414967, |
|
"loss": 0.6144, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.24573378839590443, |
|
"grad_norm": 0.0866198018193245, |
|
"learning_rate": 0.00012244897959183676, |
|
"loss": 0.6203, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2484641638225256, |
|
"grad_norm": 0.09400323033332825, |
|
"learning_rate": 0.0001238095238095238, |
|
"loss": 0.6056, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.25119453924914675, |
|
"grad_norm": 0.0817628726363182, |
|
"learning_rate": 0.0001251700680272109, |
|
"loss": 0.5853, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.25392491467576794, |
|
"grad_norm": 0.09105788916349411, |
|
"learning_rate": 0.00012653061224489798, |
|
"loss": 0.5952, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2566552901023891, |
|
"grad_norm": 0.09889201074838638, |
|
"learning_rate": 0.00012789115646258506, |
|
"loss": 0.5994, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2593856655290102, |
|
"grad_norm": 0.09481444954872131, |
|
"learning_rate": 0.00012925170068027212, |
|
"loss": 0.5918, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2621160409556314, |
|
"grad_norm": 0.11730329692363739, |
|
"learning_rate": 0.00013061224489795917, |
|
"loss": 0.592, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.26484641638225254, |
|
"grad_norm": 0.15733356773853302, |
|
"learning_rate": 0.00013197278911564626, |
|
"loss": 0.5636, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2675767918088737, |
|
"grad_norm": 0.20819880068302155, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.6101, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.27030716723549486, |
|
"grad_norm": 0.18305541574954987, |
|
"learning_rate": 0.0001346938775510204, |
|
"loss": 0.5814, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.27303754266211605, |
|
"grad_norm": 0.10316050797700882, |
|
"learning_rate": 0.00013605442176870748, |
|
"loss": 0.5871, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2757679180887372, |
|
"grad_norm": 0.13305549323558807, |
|
"learning_rate": 0.00013741496598639456, |
|
"loss": 0.5846, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2784982935153584, |
|
"grad_norm": 0.0950811356306076, |
|
"learning_rate": 0.00013877551020408165, |
|
"loss": 0.5711, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2812286689419795, |
|
"grad_norm": 0.1198628693819046, |
|
"learning_rate": 0.0001401360544217687, |
|
"loss": 0.5914, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2839590443686007, |
|
"grad_norm": 0.08809541165828705, |
|
"learning_rate": 0.00014149659863945578, |
|
"loss": 0.5872, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.28668941979522183, |
|
"grad_norm": 0.09801067411899567, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.566, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.289419795221843, |
|
"grad_norm": 0.08766568452119827, |
|
"learning_rate": 0.00014421768707482995, |
|
"loss": 0.5808, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.29215017064846416, |
|
"grad_norm": 0.09133429825305939, |
|
"learning_rate": 0.000145578231292517, |
|
"loss": 0.6037, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.29488054607508535, |
|
"grad_norm": 0.09074072539806366, |
|
"learning_rate": 0.0001469387755102041, |
|
"loss": 0.5897, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2976109215017065, |
|
"grad_norm": 0.08934789896011353, |
|
"learning_rate": 0.00014829931972789117, |
|
"loss": 0.5998, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3003412969283277, |
|
"grad_norm": 0.08707176148891449, |
|
"learning_rate": 0.00014965986394557826, |
|
"loss": 0.5762, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3030716723549488, |
|
"grad_norm": 0.0948200449347496, |
|
"learning_rate": 0.0001510204081632653, |
|
"loss": 0.5734, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.30580204778156994, |
|
"grad_norm": 0.08889783173799515, |
|
"learning_rate": 0.00015238095238095237, |
|
"loss": 0.5867, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.30853242320819113, |
|
"grad_norm": 0.08152323961257935, |
|
"learning_rate": 0.00015374149659863945, |
|
"loss": 0.5527, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.31126279863481227, |
|
"grad_norm": 0.09019389748573303, |
|
"learning_rate": 0.00015510204081632654, |
|
"loss": 0.6007, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.31399317406143346, |
|
"grad_norm": 0.08257456868886948, |
|
"learning_rate": 0.00015646258503401362, |
|
"loss": 0.5569, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3167235494880546, |
|
"grad_norm": 0.08834348618984222, |
|
"learning_rate": 0.00015782312925170067, |
|
"loss": 0.6026, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3194539249146758, |
|
"grad_norm": 0.08634665608406067, |
|
"learning_rate": 0.00015918367346938776, |
|
"loss": 0.5926, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3221843003412969, |
|
"grad_norm": 0.07867719978094101, |
|
"learning_rate": 0.00016054421768707484, |
|
"loss": 0.5707, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3249146757679181, |
|
"grad_norm": 0.09690061956644058, |
|
"learning_rate": 0.00016190476190476192, |
|
"loss": 0.5793, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.32764505119453924, |
|
"grad_norm": 0.08276376128196716, |
|
"learning_rate": 0.00016326530612244898, |
|
"loss": 0.5459, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33037542662116043, |
|
"grad_norm": 0.09276240319013596, |
|
"learning_rate": 0.00016462585034013606, |
|
"loss": 0.5732, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.33310580204778156, |
|
"grad_norm": 0.0819844901561737, |
|
"learning_rate": 0.00016598639455782315, |
|
"loss": 0.5349, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.33583617747440275, |
|
"grad_norm": 0.08146791905164719, |
|
"learning_rate": 0.00016734693877551023, |
|
"loss": 0.5656, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3385665529010239, |
|
"grad_norm": 0.0879024788737297, |
|
"learning_rate": 0.00016870748299319729, |
|
"loss": 0.5758, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3412969283276451, |
|
"grad_norm": 0.07890356332063675, |
|
"learning_rate": 0.00017006802721088434, |
|
"loss": 0.5332, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3440273037542662, |
|
"grad_norm": 0.10049955546855927, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.5671, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.34675767918088735, |
|
"grad_norm": 0.09643971920013428, |
|
"learning_rate": 0.0001727891156462585, |
|
"loss": 0.5812, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.34948805460750854, |
|
"grad_norm": 0.08666185289621353, |
|
"learning_rate": 0.0001741496598639456, |
|
"loss": 0.5487, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.35221843003412967, |
|
"grad_norm": 0.1031438484787941, |
|
"learning_rate": 0.00017551020408163265, |
|
"loss": 0.5558, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.35494880546075086, |
|
"grad_norm": 0.09404855966567993, |
|
"learning_rate": 0.00017687074829931973, |
|
"loss": 0.5615, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.357679180887372, |
|
"grad_norm": 0.09127198159694672, |
|
"learning_rate": 0.00017823129251700681, |
|
"loss": 0.5656, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3604095563139932, |
|
"grad_norm": 0.08694130182266235, |
|
"learning_rate": 0.0001795918367346939, |
|
"loss": 0.5379, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3631399317406143, |
|
"grad_norm": 0.09511597454547882, |
|
"learning_rate": 0.00018095238095238095, |
|
"loss": 0.5535, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3658703071672355, |
|
"grad_norm": 0.09129739552736282, |
|
"learning_rate": 0.00018231292517006804, |
|
"loss": 0.5678, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.36860068259385664, |
|
"grad_norm": 0.09248334169387817, |
|
"learning_rate": 0.00018367346938775512, |
|
"loss": 0.5574, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.37133105802047783, |
|
"grad_norm": 0.09906318038702011, |
|
"learning_rate": 0.0001850340136054422, |
|
"loss": 0.5499, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.37406143344709897, |
|
"grad_norm": 0.09928654134273529, |
|
"learning_rate": 0.00018639455782312926, |
|
"loss": 0.5413, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.37679180887372016, |
|
"grad_norm": 0.07559472322463989, |
|
"learning_rate": 0.00018775510204081634, |
|
"loss": 0.5475, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3795221843003413, |
|
"grad_norm": 0.08408834040164948, |
|
"learning_rate": 0.00018911564625850343, |
|
"loss": 0.5432, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3822525597269625, |
|
"grad_norm": 0.08800789713859558, |
|
"learning_rate": 0.00019047619047619048, |
|
"loss": 0.5587, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3849829351535836, |
|
"grad_norm": 0.09994784742593765, |
|
"learning_rate": 0.00019183673469387756, |
|
"loss": 0.555, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.38771331058020475, |
|
"grad_norm": 0.07616768032312393, |
|
"learning_rate": 0.00019319727891156462, |
|
"loss": 0.5621, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.39044368600682594, |
|
"grad_norm": 0.10337202996015549, |
|
"learning_rate": 0.0001945578231292517, |
|
"loss": 0.5282, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3931740614334471, |
|
"grad_norm": 0.08526328206062317, |
|
"learning_rate": 0.0001959183673469388, |
|
"loss": 0.5439, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.39590443686006827, |
|
"grad_norm": 0.10538353770971298, |
|
"learning_rate": 0.00019727891156462587, |
|
"loss": 0.5481, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3986348122866894, |
|
"grad_norm": 0.07550521194934845, |
|
"learning_rate": 0.00019863945578231293, |
|
"loss": 0.5414, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4013651877133106, |
|
"grad_norm": 0.10045620799064636, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5382, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4040955631399317, |
|
"grad_norm": 0.08987366408109665, |
|
"learning_rate": 0.00019999971548969982, |
|
"loss": 0.5417, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4068259385665529, |
|
"grad_norm": 0.0801815390586853, |
|
"learning_rate": 0.0001999988619604182, |
|
"loss": 0.5275, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.40955631399317405, |
|
"grad_norm": 0.08214934170246124, |
|
"learning_rate": 0.00019999743941701188, |
|
"loss": 0.543, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.41228668941979524, |
|
"grad_norm": 0.08146006613969803, |
|
"learning_rate": 0.00019999544786757545, |
|
"loss": 0.5409, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.4150170648464164, |
|
"grad_norm": 0.08081945031881332, |
|
"learning_rate": 0.00019999288732344122, |
|
"loss": 0.5509, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.41774744027303756, |
|
"grad_norm": 0.09135357290506363, |
|
"learning_rate": 0.0001999897577991792, |
|
"loss": 0.518, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4204778156996587, |
|
"grad_norm": 0.09191333502531052, |
|
"learning_rate": 0.0001999860593125971, |
|
"loss": 0.5276, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.4232081911262799, |
|
"grad_norm": 0.08375995606184006, |
|
"learning_rate": 0.00019998179188473997, |
|
"loss": 0.5319, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.425938566552901, |
|
"grad_norm": 0.08481922000646591, |
|
"learning_rate": 0.00019997695553989042, |
|
"loss": 0.5437, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4286689419795222, |
|
"grad_norm": 0.08768640458583832, |
|
"learning_rate": 0.00019997155030556822, |
|
"loss": 0.5445, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.43139931740614335, |
|
"grad_norm": 0.08787625283002853, |
|
"learning_rate": 0.00019996557621253027, |
|
"loss": 0.5479, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4341296928327645, |
|
"grad_norm": 0.09505843371152878, |
|
"learning_rate": 0.0001999590332947704, |
|
"loss": 0.5263, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.43686006825938567, |
|
"grad_norm": 0.10003377497196198, |
|
"learning_rate": 0.00019995192158951919, |
|
"loss": 0.5228, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4395904436860068, |
|
"grad_norm": 0.0675501748919487, |
|
"learning_rate": 0.00019994424113724363, |
|
"loss": 0.4977, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.442320819112628, |
|
"grad_norm": 0.09747067093849182, |
|
"learning_rate": 0.00019993599198164715, |
|
"loss": 0.5161, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.44505119453924913, |
|
"grad_norm": 0.0837995857000351, |
|
"learning_rate": 0.0001999271741696691, |
|
"loss": 0.5243, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4477815699658703, |
|
"grad_norm": 0.0793512687087059, |
|
"learning_rate": 0.00019991778775148465, |
|
"loss": 0.5141, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.45051194539249145, |
|
"grad_norm": 0.07802822440862656, |
|
"learning_rate": 0.00019990783278050448, |
|
"loss": 0.515, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.45324232081911264, |
|
"grad_norm": 0.08355724066495895, |
|
"learning_rate": 0.0001998973093133744, |
|
"loss": 0.5176, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4559726962457338, |
|
"grad_norm": 0.08045308291912079, |
|
"learning_rate": 0.00019988621740997512, |
|
"loss": 0.5151, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.45870307167235497, |
|
"grad_norm": 0.07589907944202423, |
|
"learning_rate": 0.00019987455713342187, |
|
"loss": 0.5249, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4614334470989761, |
|
"grad_norm": 0.08553771674633026, |
|
"learning_rate": 0.000199862328550064, |
|
"loss": 0.5485, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4641638225255973, |
|
"grad_norm": 0.08599649369716644, |
|
"learning_rate": 0.00019984953172948465, |
|
"loss": 0.53, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4668941979522184, |
|
"grad_norm": 0.06906479597091675, |
|
"learning_rate": 0.0001998361667445004, |
|
"loss": 0.5336, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4696245733788396, |
|
"grad_norm": 0.07526392489671707, |
|
"learning_rate": 0.00019982223367116076, |
|
"loss": 0.5013, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.47235494880546075, |
|
"grad_norm": 0.0722610279917717, |
|
"learning_rate": 0.00019980773258874778, |
|
"loss": 0.5217, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.4750853242320819, |
|
"grad_norm": 0.0773632749915123, |
|
"learning_rate": 0.00019979266357977564, |
|
"loss": 0.5184, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4778156996587031, |
|
"grad_norm": 0.07160216569900513, |
|
"learning_rate": 0.00019977702672999007, |
|
"loss": 0.5009, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4805460750853242, |
|
"grad_norm": 0.0764177069067955, |
|
"learning_rate": 0.00019976082212836793, |
|
"loss": 0.5126, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4832764505119454, |
|
"grad_norm": 0.07116773724555969, |
|
"learning_rate": 0.0001997440498671168, |
|
"loss": 0.514, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.48600682593856653, |
|
"grad_norm": 0.08402683585882187, |
|
"learning_rate": 0.00019972671004167433, |
|
"loss": 0.5133, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4887372013651877, |
|
"grad_norm": 0.07286666333675385, |
|
"learning_rate": 0.00019970880275070762, |
|
"loss": 0.5221, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.49146757679180886, |
|
"grad_norm": 0.08641263097524643, |
|
"learning_rate": 0.00019969032809611287, |
|
"loss": 0.4959, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.49419795221843005, |
|
"grad_norm": 0.08849737048149109, |
|
"learning_rate": 0.0001996712861830147, |
|
"loss": 0.4952, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4969283276450512, |
|
"grad_norm": 0.08661802858114243, |
|
"learning_rate": 0.00019965167711976552, |
|
"loss": 0.5023, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.49965870307167237, |
|
"grad_norm": 0.08355259150266647, |
|
"learning_rate": 0.0001996315010179449, |
|
"loss": 0.5235, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.5023890784982935, |
|
"grad_norm": 0.07524804770946503, |
|
"learning_rate": 0.00019961075799235903, |
|
"loss": 0.5143, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.5051194539249146, |
|
"grad_norm": 0.08126044273376465, |
|
"learning_rate": 0.00019958944816104, |
|
"loss": 0.496, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5078498293515359, |
|
"grad_norm": 0.08320248872041702, |
|
"learning_rate": 0.00019956757164524516, |
|
"loss": 0.5106, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.510580204778157, |
|
"grad_norm": 0.07375509291887283, |
|
"learning_rate": 0.00019954512856945632, |
|
"loss": 0.4811, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.5133105802047782, |
|
"grad_norm": 0.07187776267528534, |
|
"learning_rate": 0.00019952211906137932, |
|
"loss": 0.5104, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.5160409556313993, |
|
"grad_norm": 0.07441398501396179, |
|
"learning_rate": 0.00019949854325194294, |
|
"loss": 0.5304, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.5187713310580204, |
|
"grad_norm": 0.07976701855659485, |
|
"learning_rate": 0.00019947440127529836, |
|
"loss": 0.4945, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5215017064846417, |
|
"grad_norm": 0.07280328124761581, |
|
"learning_rate": 0.00019944969326881845, |
|
"loss": 0.4848, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5242320819112628, |
|
"grad_norm": 0.07618428766727448, |
|
"learning_rate": 0.00019942441937309684, |
|
"loss": 0.4858, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5269624573378839, |
|
"grad_norm": 0.0665225088596344, |
|
"learning_rate": 0.00019939857973194717, |
|
"loss": 0.4955, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5296928327645051, |
|
"grad_norm": 0.08379194140434265, |
|
"learning_rate": 0.0001993721744924024, |
|
"loss": 0.5067, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5324232081911263, |
|
"grad_norm": 0.07564423978328705, |
|
"learning_rate": 0.00019934520380471372, |
|
"loss": 0.5159, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5351535836177475, |
|
"grad_norm": 0.07225633412599564, |
|
"learning_rate": 0.0001993176678223499, |
|
"loss": 0.5144, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5378839590443686, |
|
"grad_norm": 0.07224252074956894, |
|
"learning_rate": 0.0001992895667019964, |
|
"loss": 0.4859, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.5406143344709897, |
|
"grad_norm": 0.079926997423172, |
|
"learning_rate": 0.0001992609006035543, |
|
"loss": 0.4872, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.543344709897611, |
|
"grad_norm": 0.08545151352882385, |
|
"learning_rate": 0.0001992316696901397, |
|
"loss": 0.5105, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5460750853242321, |
|
"grad_norm": 0.08008193224668503, |
|
"learning_rate": 0.00019920187412808248, |
|
"loss": 0.4903, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5488054607508532, |
|
"grad_norm": 0.06717066466808319, |
|
"learning_rate": 0.0001991715140869255, |
|
"loss": 0.5037, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5515358361774744, |
|
"grad_norm": 0.08613338321447372, |
|
"learning_rate": 0.00019914058973942368, |
|
"loss": 0.4999, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5542662116040956, |
|
"grad_norm": 0.07288234680891037, |
|
"learning_rate": 0.00019910910126154293, |
|
"loss": 0.5019, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5569965870307167, |
|
"grad_norm": 0.07831370085477829, |
|
"learning_rate": 0.00019907704883245916, |
|
"loss": 0.4595, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5597269624573379, |
|
"grad_norm": 0.0916525200009346, |
|
"learning_rate": 0.00019904443263455728, |
|
"loss": 0.4994, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.562457337883959, |
|
"grad_norm": 0.07431495934724808, |
|
"learning_rate": 0.00019901125285343022, |
|
"loss": 0.5059, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5651877133105802, |
|
"grad_norm": 0.07864730060100555, |
|
"learning_rate": 0.0001989775096778777, |
|
"loss": 0.4824, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5679180887372014, |
|
"grad_norm": 0.06928006559610367, |
|
"learning_rate": 0.0001989432032999054, |
|
"loss": 0.4887, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5706484641638225, |
|
"grad_norm": 0.07330948859453201, |
|
"learning_rate": 0.0001989083339147237, |
|
"loss": 0.4804, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5733788395904437, |
|
"grad_norm": 0.07905860990285873, |
|
"learning_rate": 0.0001988729017207465, |
|
"loss": 0.5126, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5761092150170648, |
|
"grad_norm": 0.07062509655952454, |
|
"learning_rate": 0.00019883690691959035, |
|
"loss": 0.5063, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.578839590443686, |
|
"grad_norm": 0.071404367685318, |
|
"learning_rate": 0.00019880034971607308, |
|
"loss": 0.495, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5815699658703072, |
|
"grad_norm": 0.0727284774184227, |
|
"learning_rate": 0.00019876323031821266, |
|
"loss": 0.4994, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5843003412969283, |
|
"grad_norm": 0.07198608666658401, |
|
"learning_rate": 0.00019872554893722618, |
|
"loss": 0.4903, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5870307167235495, |
|
"grad_norm": 0.07637451589107513, |
|
"learning_rate": 0.0001986873057875284, |
|
"loss": 0.5057, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5897610921501707, |
|
"grad_norm": 0.06596951186656952, |
|
"learning_rate": 0.00019864850108673073, |
|
"loss": 0.4932, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5924914675767918, |
|
"grad_norm": 0.06999579071998596, |
|
"learning_rate": 0.0001986091350556399, |
|
"loss": 0.4887, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.595221843003413, |
|
"grad_norm": 0.06687980890274048, |
|
"learning_rate": 0.00019856920791825683, |
|
"loss": 0.472, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5979522184300341, |
|
"grad_norm": 0.07001427561044693, |
|
"learning_rate": 0.00019852871990177503, |
|
"loss": 0.4692, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.6006825938566553, |
|
"grad_norm": 0.06714101880788803, |
|
"learning_rate": 0.00019848767123657976, |
|
"loss": 0.4813, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6034129692832765, |
|
"grad_norm": 0.07292049378156662, |
|
"learning_rate": 0.0001984460621562463, |
|
"loss": 0.4885, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.6061433447098976, |
|
"grad_norm": 0.06814104318618774, |
|
"learning_rate": 0.00019840389289753896, |
|
"loss": 0.4938, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.6088737201365187, |
|
"grad_norm": 0.06866355985403061, |
|
"learning_rate": 0.00019836116370040944, |
|
"loss": 0.4776, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.6116040955631399, |
|
"grad_norm": 0.07145702093839645, |
|
"learning_rate": 0.00019831787480799568, |
|
"loss": 0.4883, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.6143344709897611, |
|
"grad_norm": 0.06319977343082428, |
|
"learning_rate": 0.00019827402646662047, |
|
"loss": 0.4882, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6170648464163823, |
|
"grad_norm": 0.08186688274145126, |
|
"learning_rate": 0.0001982296189257898, |
|
"loss": 0.4917, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.6197952218430034, |
|
"grad_norm": 0.06892900168895721, |
|
"learning_rate": 0.00019818465243819184, |
|
"loss": 0.4808, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.6225255972696245, |
|
"grad_norm": 0.0752168744802475, |
|
"learning_rate": 0.00019813912725969509, |
|
"loss": 0.4858, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.6252559726962458, |
|
"grad_norm": 0.08079662919044495, |
|
"learning_rate": 0.0001980930436493472, |
|
"loss": 0.5101, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6279863481228669, |
|
"grad_norm": 0.0717153325676918, |
|
"learning_rate": 0.00019804640186937343, |
|
"loss": 0.4799, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.630716723549488, |
|
"grad_norm": 0.08962002396583557, |
|
"learning_rate": 0.0001979992021851751, |
|
"loss": 0.5067, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6334470989761092, |
|
"grad_norm": 0.08904211223125458, |
|
"learning_rate": 0.00019795144486532814, |
|
"loss": 0.4725, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6361774744027304, |
|
"grad_norm": 0.06842932850122452, |
|
"learning_rate": 0.00019790313018158156, |
|
"loss": 0.4996, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.6389078498293516, |
|
"grad_norm": 0.08361311256885529, |
|
"learning_rate": 0.0001978542584088558, |
|
"loss": 0.4945, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6416382252559727, |
|
"grad_norm": 0.07219431549310684, |
|
"learning_rate": 0.00019780482982524142, |
|
"loss": 0.4488, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6443686006825938, |
|
"grad_norm": 0.07717226445674896, |
|
"learning_rate": 0.00019775484471199715, |
|
"loss": 0.4814, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.647098976109215, |
|
"grad_norm": 0.07770105451345444, |
|
"learning_rate": 0.0001977043033535486, |
|
"loss": 0.4731, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6498293515358362, |
|
"grad_norm": 0.06878919899463654, |
|
"learning_rate": 0.00019765320603748655, |
|
"loss": 0.4833, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6525597269624573, |
|
"grad_norm": 0.07085343450307846, |
|
"learning_rate": 0.0001976015530545652, |
|
"loss": 0.4907, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6552901023890785, |
|
"grad_norm": 0.07935165613889694, |
|
"learning_rate": 0.0001975493446987007, |
|
"loss": 0.4794, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6580204778156996, |
|
"grad_norm": 0.06543820351362228, |
|
"learning_rate": 0.00019749658126696934, |
|
"loss": 0.4906, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6607508532423209, |
|
"grad_norm": 0.07727054506540298, |
|
"learning_rate": 0.00019744326305960595, |
|
"loss": 0.4868, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.663481228668942, |
|
"grad_norm": 0.06668544560670853, |
|
"learning_rate": 0.00019738939038000205, |
|
"loss": 0.475, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6662116040955631, |
|
"grad_norm": 0.07048569619655609, |
|
"learning_rate": 0.00019733496353470433, |
|
"loss": 0.4878, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6689419795221843, |
|
"grad_norm": 0.07110477238893509, |
|
"learning_rate": 0.00019727998283341274, |
|
"loss": 0.4663, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6716723549488055, |
|
"grad_norm": 0.07245586067438126, |
|
"learning_rate": 0.00019722444858897878, |
|
"loss": 0.4899, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6744027303754266, |
|
"grad_norm": 0.07484875619411469, |
|
"learning_rate": 0.00019716836111740378, |
|
"loss": 0.4831, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6771331058020478, |
|
"grad_norm": 0.07812648266553879, |
|
"learning_rate": 0.00019711172073783696, |
|
"loss": 0.4654, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6798634812286689, |
|
"grad_norm": 0.060632165521383286, |
|
"learning_rate": 0.00019705452777257377, |
|
"loss": 0.4706, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6825938566552902, |
|
"grad_norm": 0.07092992216348648, |
|
"learning_rate": 0.000196996782547054, |
|
"loss": 0.4792, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6853242320819113, |
|
"grad_norm": 0.06629595905542374, |
|
"learning_rate": 0.00019693848538985983, |
|
"loss": 0.4791, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6880546075085324, |
|
"grad_norm": 0.06915664672851562, |
|
"learning_rate": 0.00019687963663271409, |
|
"loss": 0.4623, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6907849829351536, |
|
"grad_norm": 0.0694665014743805, |
|
"learning_rate": 0.00019682023661047836, |
|
"loss": 0.48, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6935153583617747, |
|
"grad_norm": 0.06899196654558182, |
|
"learning_rate": 0.00019676028566115102, |
|
"loss": 0.4855, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6962457337883959, |
|
"grad_norm": 0.0740811675786972, |
|
"learning_rate": 0.00019669978412586528, |
|
"loss": 0.4833, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6989761092150171, |
|
"grad_norm": 0.06517481803894043, |
|
"learning_rate": 0.00019663873234888733, |
|
"loss": 0.4523, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.7017064846416382, |
|
"grad_norm": 0.06481153517961502, |
|
"learning_rate": 0.0001965771306776144, |
|
"loss": 0.4689, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.7044368600682593, |
|
"grad_norm": 0.06042364612221718, |
|
"learning_rate": 0.00019651497946257266, |
|
"loss": 0.4757, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.7071672354948806, |
|
"grad_norm": 0.0717868059873581, |
|
"learning_rate": 0.00019645227905741534, |
|
"loss": 0.4773, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.7098976109215017, |
|
"grad_norm": 0.06427443772554398, |
|
"learning_rate": 0.00019638902981892068, |
|
"loss": 0.4875, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7126279863481229, |
|
"grad_norm": 0.07786547392606735, |
|
"learning_rate": 0.00019632523210698987, |
|
"loss": 0.4758, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.715358361774744, |
|
"grad_norm": 0.07115910202264786, |
|
"learning_rate": 0.00019626088628464498, |
|
"loss": 0.4651, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.7180887372013652, |
|
"grad_norm": 0.06626811623573303, |
|
"learning_rate": 0.00019619599271802706, |
|
"loss": 0.4873, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.7208191126279864, |
|
"grad_norm": 0.07854583859443665, |
|
"learning_rate": 0.00019613055177639384, |
|
"loss": 0.4945, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.7235494880546075, |
|
"grad_norm": 0.0847892239689827, |
|
"learning_rate": 0.00019606456383211777, |
|
"loss": 0.4671, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7262798634812286, |
|
"grad_norm": 0.06735772639513016, |
|
"learning_rate": 0.00019599802926068384, |
|
"loss": 0.4767, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.7290102389078499, |
|
"grad_norm": 0.07502768933773041, |
|
"learning_rate": 0.00019593094844068748, |
|
"loss": 0.462, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.731740614334471, |
|
"grad_norm": 0.07276903837919235, |
|
"learning_rate": 0.00019586332175383238, |
|
"loss": 0.4754, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7344709897610922, |
|
"grad_norm": 0.07755447924137115, |
|
"learning_rate": 0.00019579514958492826, |
|
"loss": 0.492, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7372013651877133, |
|
"grad_norm": 0.07876396179199219, |
|
"learning_rate": 0.0001957264323218889, |
|
"loss": 0.4737, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7399317406143344, |
|
"grad_norm": 0.07997962832450867, |
|
"learning_rate": 0.0001956571703557296, |
|
"loss": 0.4592, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7426621160409557, |
|
"grad_norm": 0.08079583197832108, |
|
"learning_rate": 0.00019558736408056525, |
|
"loss": 0.473, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7453924914675768, |
|
"grad_norm": 0.0736604854464531, |
|
"learning_rate": 0.00019551701389360795, |
|
"loss": 0.4741, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7481228668941979, |
|
"grad_norm": 0.0741550549864769, |
|
"learning_rate": 0.00019544612019516472, |
|
"loss": 0.4611, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7508532423208191, |
|
"grad_norm": 0.06802786141633987, |
|
"learning_rate": 0.00019537468338863537, |
|
"loss": 0.4621, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7535836177474403, |
|
"grad_norm": 0.06499720364809036, |
|
"learning_rate": 0.00019530270388050998, |
|
"loss": 0.4676, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.7563139931740614, |
|
"grad_norm": 0.06809037923812866, |
|
"learning_rate": 0.00019523018208036677, |
|
"loss": 0.475, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7590443686006826, |
|
"grad_norm": 0.06455886363983154, |
|
"learning_rate": 0.0001951571184008698, |
|
"loss": 0.4807, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7617747440273037, |
|
"grad_norm": 0.06833679229021072, |
|
"learning_rate": 0.00019508351325776642, |
|
"loss": 0.4751, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.764505119453925, |
|
"grad_norm": 0.07593976706266403, |
|
"learning_rate": 0.00019500936706988502, |
|
"loss": 0.4714, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7672354948805461, |
|
"grad_norm": 0.0687364712357521, |
|
"learning_rate": 0.00019493468025913276, |
|
"loss": 0.4575, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7699658703071672, |
|
"grad_norm": 0.07183225452899933, |
|
"learning_rate": 0.00019485945325049288, |
|
"loss": 0.4815, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7726962457337884, |
|
"grad_norm": 0.06775309145450592, |
|
"learning_rate": 0.00019478368647202264, |
|
"loss": 0.4543, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7754266211604095, |
|
"grad_norm": 0.06261654198169708, |
|
"learning_rate": 0.00019470738035485058, |
|
"loss": 0.4724, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7781569965870307, |
|
"grad_norm": 0.06674676388502121, |
|
"learning_rate": 0.00019463053533317425, |
|
"loss": 0.4667, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7808873720136519, |
|
"grad_norm": 0.06266098469495773, |
|
"learning_rate": 0.0001945531518442576, |
|
"loss": 0.4614, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.783617747440273, |
|
"grad_norm": 0.06769178062677383, |
|
"learning_rate": 0.0001944752303284287, |
|
"loss": 0.4609, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7863481228668942, |
|
"grad_norm": 0.07618339359760284, |
|
"learning_rate": 0.00019439677122907697, |
|
"loss": 0.4822, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7890784982935154, |
|
"grad_norm": 0.06216439977288246, |
|
"learning_rate": 0.00019431777499265087, |
|
"loss": 0.4573, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7918088737201365, |
|
"grad_norm": 0.06998062878847122, |
|
"learning_rate": 0.00019423824206865527, |
|
"loss": 0.4683, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7945392491467577, |
|
"grad_norm": 0.06178448721766472, |
|
"learning_rate": 0.00019415817290964883, |
|
"loss": 0.4643, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7972696245733788, |
|
"grad_norm": 0.06611185520887375, |
|
"learning_rate": 0.00019407756797124164, |
|
"loss": 0.4712, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06682468205690384, |
|
"learning_rate": 0.00019399642771209238, |
|
"loss": 0.474, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.8027303754266212, |
|
"grad_norm": 0.0632803738117218, |
|
"learning_rate": 0.00019391475259390584, |
|
"loss": 0.4776, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.8054607508532423, |
|
"grad_norm": 0.06498962640762329, |
|
"learning_rate": 0.0001938325430814302, |
|
"loss": 0.4735, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.8081911262798634, |
|
"grad_norm": 0.06621643900871277, |
|
"learning_rate": 0.00019374979964245463, |
|
"loss": 0.4785, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.8109215017064847, |
|
"grad_norm": 0.05847141519188881, |
|
"learning_rate": 0.00019366652274780628, |
|
"loss": 0.4702, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.8136518771331058, |
|
"grad_norm": 0.06962229311466217, |
|
"learning_rate": 0.00019358271287134784, |
|
"loss": 0.4612, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.816382252559727, |
|
"grad_norm": 0.06132384389638901, |
|
"learning_rate": 0.00019349837048997478, |
|
"loss": 0.4453, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.8191126279863481, |
|
"grad_norm": 0.06574399024248123, |
|
"learning_rate": 0.00019341349608361267, |
|
"loss": 0.4545, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8218430034129692, |
|
"grad_norm": 0.06561442464590073, |
|
"learning_rate": 0.00019332809013521428, |
|
"loss": 0.4619, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.8245733788395905, |
|
"grad_norm": 0.06309875100851059, |
|
"learning_rate": 0.00019324215313075706, |
|
"loss": 0.465, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.8273037542662116, |
|
"grad_norm": 0.06544878333806992, |
|
"learning_rate": 0.00019315568555924035, |
|
"loss": 0.4571, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.8300341296928327, |
|
"grad_norm": 0.07011238485574722, |
|
"learning_rate": 0.0001930686879126824, |
|
"loss": 0.4579, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.8327645051194539, |
|
"grad_norm": 0.06445574760437012, |
|
"learning_rate": 0.0001929811606861177, |
|
"loss": 0.4695, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8354948805460751, |
|
"grad_norm": 0.061930734664201736, |
|
"learning_rate": 0.00019289310437759427, |
|
"loss": 0.4449, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8382252559726963, |
|
"grad_norm": 0.0658838227391243, |
|
"learning_rate": 0.00019280451948817059, |
|
"loss": 0.4726, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.8409556313993174, |
|
"grad_norm": 0.06302706897258759, |
|
"learning_rate": 0.00019271540652191296, |
|
"loss": 0.447, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8436860068259385, |
|
"grad_norm": 0.08308806270360947, |
|
"learning_rate": 0.0001926257659858925, |
|
"loss": 0.4605, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8464163822525598, |
|
"grad_norm": 0.06508838385343552, |
|
"learning_rate": 0.00019253559839018235, |
|
"loss": 0.4778, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8491467576791809, |
|
"grad_norm": 0.07429094612598419, |
|
"learning_rate": 0.00019244490424785468, |
|
"loss": 0.4659, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.851877133105802, |
|
"grad_norm": 0.07138285785913467, |
|
"learning_rate": 0.00019235368407497788, |
|
"loss": 0.4564, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.8546075085324232, |
|
"grad_norm": 0.07202211022377014, |
|
"learning_rate": 0.00019226193839061347, |
|
"loss": 0.4377, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8573378839590444, |
|
"grad_norm": 0.0779070258140564, |
|
"learning_rate": 0.0001921696677168133, |
|
"loss": 0.4532, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.8600682593856656, |
|
"grad_norm": 0.07717596739530563, |
|
"learning_rate": 0.00019207687257861655, |
|
"loss": 0.4654, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8627986348122867, |
|
"grad_norm": 0.0708346962928772, |
|
"learning_rate": 0.00019198355350404667, |
|
"loss": 0.4584, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8655290102389078, |
|
"grad_norm": 0.0656716600060463, |
|
"learning_rate": 0.00019188971102410837, |
|
"loss": 0.4504, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.868259385665529, |
|
"grad_norm": 0.06869971752166748, |
|
"learning_rate": 0.00019179534567278475, |
|
"loss": 0.4592, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8709897610921502, |
|
"grad_norm": 0.06358928978443146, |
|
"learning_rate": 0.00019170045798703406, |
|
"loss": 0.4376, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.8737201365187713, |
|
"grad_norm": 0.06602993607521057, |
|
"learning_rate": 0.0001916050485067868, |
|
"loss": 0.4692, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8764505119453925, |
|
"grad_norm": 0.06115058436989784, |
|
"learning_rate": 0.00019150911777494258, |
|
"loss": 0.462, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8791808873720136, |
|
"grad_norm": 0.06374403834342957, |
|
"learning_rate": 0.00019141266633736697, |
|
"loss": 0.4325, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8819112627986349, |
|
"grad_norm": 0.06459895521402359, |
|
"learning_rate": 0.0001913156947428886, |
|
"loss": 0.4605, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.884641638225256, |
|
"grad_norm": 0.06160016357898712, |
|
"learning_rate": 0.00019121820354329577, |
|
"loss": 0.4604, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8873720136518771, |
|
"grad_norm": 0.06345291435718536, |
|
"learning_rate": 0.00019112019329333346, |
|
"loss": 0.4565, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8901023890784983, |
|
"grad_norm": 0.06534894555807114, |
|
"learning_rate": 0.00019102166455070024, |
|
"loss": 0.4619, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8928327645051195, |
|
"grad_norm": 0.06186550110578537, |
|
"learning_rate": 0.00019092261787604492, |
|
"loss": 0.4477, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8955631399317406, |
|
"grad_norm": 0.058699868619441986, |
|
"learning_rate": 0.00019082305383296352, |
|
"loss": 0.4484, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8982935153583618, |
|
"grad_norm": 0.05798410624265671, |
|
"learning_rate": 0.00019072297298799589, |
|
"loss": 0.4605, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.9010238907849829, |
|
"grad_norm": 0.06147664040327072, |
|
"learning_rate": 0.00019062237591062272, |
|
"loss": 0.4489, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.903754266211604, |
|
"grad_norm": 0.06032559648156166, |
|
"learning_rate": 0.00019052126317326207, |
|
"loss": 0.4412, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.9064846416382253, |
|
"grad_norm": 0.06326504051685333, |
|
"learning_rate": 0.00019041963535126625, |
|
"loss": 0.4547, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.9092150170648464, |
|
"grad_norm": 0.06808637827634811, |
|
"learning_rate": 0.0001903174930229185, |
|
"loss": 0.4513, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.9119453924914676, |
|
"grad_norm": 0.06384904682636261, |
|
"learning_rate": 0.00019021483676942973, |
|
"loss": 0.4542, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.9146757679180887, |
|
"grad_norm": 0.07148803770542145, |
|
"learning_rate": 0.00019011166717493517, |
|
"loss": 0.4569, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9174061433447099, |
|
"grad_norm": 0.06942867487668991, |
|
"learning_rate": 0.000190007984826491, |
|
"loss": 0.4496, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.9201365187713311, |
|
"grad_norm": 0.06153569370508194, |
|
"learning_rate": 0.00018990379031407124, |
|
"loss": 0.464, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.9228668941979522, |
|
"grad_norm": 0.07417679578065872, |
|
"learning_rate": 0.00018979908423056408, |
|
"loss": 0.4396, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.9255972696245733, |
|
"grad_norm": 0.06745341420173645, |
|
"learning_rate": 0.0001896938671717687, |
|
"loss": 0.4584, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.9283276450511946, |
|
"grad_norm": 0.060262780636548996, |
|
"learning_rate": 0.00018958813973639184, |
|
"loss": 0.4363, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9310580204778157, |
|
"grad_norm": 0.06427337974309921, |
|
"learning_rate": 0.0001894819025260444, |
|
"loss": 0.4352, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.9337883959044369, |
|
"grad_norm": 0.06150776520371437, |
|
"learning_rate": 0.00018937515614523797, |
|
"loss": 0.4644, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.936518771331058, |
|
"grad_norm": 0.06864424049854279, |
|
"learning_rate": 0.0001892679012013815, |
|
"loss": 0.4608, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.9392491467576792, |
|
"grad_norm": 0.06174071133136749, |
|
"learning_rate": 0.00018916013830477766, |
|
"loss": 0.4402, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9419795221843004, |
|
"grad_norm": 0.0684589147567749, |
|
"learning_rate": 0.00018905186806861957, |
|
"loss": 0.4569, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9447098976109215, |
|
"grad_norm": 0.05750627443194389, |
|
"learning_rate": 0.00018894309110898712, |
|
"loss": 0.4522, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9474402730375426, |
|
"grad_norm": 0.0697883740067482, |
|
"learning_rate": 0.00018883380804484367, |
|
"loss": 0.4594, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9501706484641638, |
|
"grad_norm": 0.06613462418317795, |
|
"learning_rate": 0.00018872401949803237, |
|
"loss": 0.4459, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.952901023890785, |
|
"grad_norm": 0.06346327811479568, |
|
"learning_rate": 0.00018861372609327263, |
|
"loss": 0.4316, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9556313993174061, |
|
"grad_norm": 0.06382953375577927, |
|
"learning_rate": 0.00018850292845815672, |
|
"loss": 0.4358, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9583617747440273, |
|
"grad_norm": 0.07121171057224274, |
|
"learning_rate": 0.0001883916272231459, |
|
"loss": 0.465, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9610921501706484, |
|
"grad_norm": 0.06311832368373871, |
|
"learning_rate": 0.0001882798230215672, |
|
"loss": 0.4478, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9638225255972697, |
|
"grad_norm": 0.06858519464731216, |
|
"learning_rate": 0.00018816751648960956, |
|
"loss": 0.4402, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.9665529010238908, |
|
"grad_norm": 0.06063356623053551, |
|
"learning_rate": 0.00018805470826632024, |
|
"loss": 0.4373, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9692832764505119, |
|
"grad_norm": 0.06550437211990356, |
|
"learning_rate": 0.0001879413989936013, |
|
"loss": 0.4448, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9720136518771331, |
|
"grad_norm": 0.06248946860432625, |
|
"learning_rate": 0.00018782758931620584, |
|
"loss": 0.4576, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9747440273037543, |
|
"grad_norm": 0.07067371159791946, |
|
"learning_rate": 0.00018771327988173435, |
|
"loss": 0.4644, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.9774744027303754, |
|
"grad_norm": 0.06225898116827011, |
|
"learning_rate": 0.00018759847134063108, |
|
"loss": 0.4617, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9802047781569966, |
|
"grad_norm": 0.061437107622623444, |
|
"learning_rate": 0.0001874831643461803, |
|
"loss": 0.4339, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9829351535836177, |
|
"grad_norm": 0.059149857610464096, |
|
"learning_rate": 0.00018736735955450251, |
|
"loss": 0.4238, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.985665529010239, |
|
"grad_norm": 0.06511219590902328, |
|
"learning_rate": 0.0001872510576245509, |
|
"loss": 0.4394, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9883959044368601, |
|
"grad_norm": 0.06580841541290283, |
|
"learning_rate": 0.00018713425921810733, |
|
"loss": 0.4218, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9911262798634812, |
|
"grad_norm": 0.07789267599582672, |
|
"learning_rate": 0.00018701696499977884, |
|
"loss": 0.4524, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9938566552901024, |
|
"grad_norm": 0.06430528312921524, |
|
"learning_rate": 0.0001868991756369937, |
|
"loss": 0.4503, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9965870307167235, |
|
"grad_norm": 0.06355779618024826, |
|
"learning_rate": 0.00018678089179999762, |
|
"loss": 0.4556, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9993174061433447, |
|
"grad_norm": 0.06800378113985062, |
|
"learning_rate": 0.00018666211416184999, |
|
"loss": 0.44, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9993174061433447, |
|
"eval_loss": 0.4462641775608063, |
|
"eval_runtime": 311.1378, |
|
"eval_samples_per_second": 8.369, |
|
"eval_steps_per_second": 1.048, |
|
"step": 366 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1464, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.186219049536717e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|