{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5055526176626124,
  "eval_steps": 500,
  "global_step": 239,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021152829190904283,
      "grad_norm": 3.663360357284546,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.3874,
      "step": 1
    },
    {
      "epoch": 0.004230565838180857,
      "grad_norm": 4.501073360443115,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.2804,
      "step": 2
    },
    {
      "epoch": 0.006345848757271285,
      "grad_norm": 4.990996837615967,
      "learning_rate": 3e-06,
      "loss": 1.2598,
      "step": 3
    },
    {
      "epoch": 0.008461131676361713,
      "grad_norm": 5.776909828186035,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.6224,
      "step": 4
    },
    {
      "epoch": 0.010576414595452142,
      "grad_norm": 5.360688209533691,
      "learning_rate": 5e-06,
      "loss": 1.4513,
      "step": 5
    },
    {
      "epoch": 0.01269169751454257,
      "grad_norm": 5.100008964538574,
      "learning_rate": 6e-06,
      "loss": 1.3938,
      "step": 6
    },
    {
      "epoch": 0.014806980433632998,
      "grad_norm": 6.129480838775635,
      "learning_rate": 7.000000000000001e-06,
      "loss": 1.6399,
      "step": 7
    },
    {
      "epoch": 0.016922263352723427,
      "grad_norm": 6.223807334899902,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.5192,
      "step": 8
    },
    {
      "epoch": 0.019037546271813855,
      "grad_norm": 5.101447582244873,
      "learning_rate": 9e-06,
      "loss": 1.4052,
      "step": 9
    },
    {
      "epoch": 0.021152829190904283,
      "grad_norm": 5.858772277832031,
      "learning_rate": 1e-05,
      "loss": 1.4977,
      "step": 10
    },
    {
      "epoch": 0.023268112109994712,
      "grad_norm": 5.468352794647217,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.4335,
      "step": 11
    },
    {
      "epoch": 0.02538339502908514,
      "grad_norm": 5.489218235015869,
      "learning_rate": 1.2e-05,
      "loss": 1.4383,
      "step": 12
    },
    {
      "epoch": 0.02749867794817557,
      "grad_norm": 5.1987624168396,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.4135,
      "step": 13
    },
    {
      "epoch": 0.029613960867265997,
      "grad_norm": 5.416074275970459,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 1.3889,
      "step": 14
    },
    {
      "epoch": 0.03172924378635643,
      "grad_norm": 5.2188401222229,
      "learning_rate": 1.5e-05,
      "loss": 1.4194,
      "step": 15
    },
    {
      "epoch": 0.033844526705446853,
      "grad_norm": 5.172801494598389,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.5219,
      "step": 16
    },
    {
      "epoch": 0.035959809624537285,
      "grad_norm": 4.828940391540527,
      "learning_rate": 1.7000000000000003e-05,
      "loss": 1.3965,
      "step": 17
    },
    {
      "epoch": 0.03807509254362771,
      "grad_norm": 4.481316566467285,
      "learning_rate": 1.8e-05,
      "loss": 1.3802,
      "step": 18
    },
    {
      "epoch": 0.04019037546271814,
      "grad_norm": 4.364404201507568,
      "learning_rate": 1.9e-05,
      "loss": 1.4343,
      "step": 19
    },
    {
      "epoch": 0.04230565838180857,
      "grad_norm": 4.6412177085876465,
      "learning_rate": 2e-05,
      "loss": 1.3295,
      "step": 20
    },
    {
      "epoch": 0.044420941300899,
      "grad_norm": 4.272985458374023,
      "learning_rate": 2.1e-05,
      "loss": 1.3726,
      "step": 21
    },
    {
      "epoch": 0.046536224219989424,
      "grad_norm": 4.246971607208252,
      "learning_rate": 2.2000000000000003e-05,
      "loss": 1.3176,
      "step": 22
    },
    {
      "epoch": 0.048651507139079855,
      "grad_norm": 4.056257724761963,
      "learning_rate": 2.3000000000000003e-05,
      "loss": 1.3644,
      "step": 23
    },
    {
      "epoch": 0.05076679005817028,
      "grad_norm": 3.9494340419769287,
      "learning_rate": 2.4e-05,
      "loss": 1.4087,
      "step": 24
    },
    {
      "epoch": 0.05288207297726071,
      "grad_norm": 3.7648792266845703,
      "learning_rate": 2.5e-05,
      "loss": 1.2293,
      "step": 25
    },
    {
      "epoch": 0.05499735589635114,
      "grad_norm": 3.988255262374878,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.234,
      "step": 26
    },
    {
      "epoch": 0.05711263881544157,
      "grad_norm": 3.6617515087127686,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.2011,
      "step": 27
    },
    {
      "epoch": 0.059227921734531994,
      "grad_norm": 3.4993789196014404,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 1.1668,
      "step": 28
    },
    {
      "epoch": 0.061343204653622425,
      "grad_norm": 4.011035919189453,
      "learning_rate": 2.9e-05,
      "loss": 1.3237,
      "step": 29
    },
    {
      "epoch": 0.06345848757271286,
      "grad_norm": 3.6400904655456543,
      "learning_rate": 3e-05,
      "loss": 1.2794,
      "step": 30
    },
    {
      "epoch": 0.06557377049180328,
      "grad_norm": 3.648898124694824,
      "learning_rate": 3.1e-05,
      "loss": 1.182,
      "step": 31
    },
    {
      "epoch": 0.06768905341089371,
      "grad_norm": 3.6320934295654297,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.1123,
      "step": 32
    },
    {
      "epoch": 0.06980433632998413,
      "grad_norm": 4.042608737945557,
      "learning_rate": 3.3e-05,
      "loss": 1.1743,
      "step": 33
    },
    {
      "epoch": 0.07191961924907457,
      "grad_norm": 4.352260112762451,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 1.2997,
      "step": 34
    },
    {
      "epoch": 0.074034902168165,
      "grad_norm": 4.048530101776123,
      "learning_rate": 3.5e-05,
      "loss": 1.1187,
      "step": 35
    },
    {
      "epoch": 0.07615018508725542,
      "grad_norm": 3.943918228149414,
      "learning_rate": 3.6e-05,
      "loss": 1.1385,
      "step": 36
    },
    {
      "epoch": 0.07826546800634585,
      "grad_norm": 4.064213752746582,
      "learning_rate": 3.7e-05,
      "loss": 1.1304,
      "step": 37
    },
    {
      "epoch": 0.08038075092543628,
      "grad_norm": 3.7748069763183594,
      "learning_rate": 3.8e-05,
      "loss": 1.2083,
      "step": 38
    },
    {
      "epoch": 0.08249603384452671,
      "grad_norm": 3.924967050552368,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 1.1601,
      "step": 39
    },
    {
      "epoch": 0.08461131676361713,
      "grad_norm": 3.864428758621216,
      "learning_rate": 4e-05,
      "loss": 1.0968,
      "step": 40
    },
    {
      "epoch": 0.08672659968270756,
      "grad_norm": 4.152803897857666,
      "learning_rate": 4.1e-05,
      "loss": 1.0723,
      "step": 41
    },
    {
      "epoch": 0.088841882601798,
      "grad_norm": 4.155374526977539,
      "learning_rate": 4.2e-05,
      "loss": 1.139,
      "step": 42
    },
    {
      "epoch": 0.09095716552088842,
      "grad_norm": 4.044234275817871,
      "learning_rate": 4.3e-05,
      "loss": 1.1796,
      "step": 43
    },
    {
      "epoch": 0.09307244843997885,
      "grad_norm": 3.951974868774414,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.2579,
      "step": 44
    },
    {
      "epoch": 0.09518773135906927,
      "grad_norm": 4.202303886413574,
      "learning_rate": 4.5e-05,
      "loss": 1.127,
      "step": 45
    },
    {
      "epoch": 0.09730301427815971,
      "grad_norm": 4.821532249450684,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.3201,
      "step": 46
    },
    {
      "epoch": 0.09941829719725014,
      "grad_norm": 4.339487075805664,
      "learning_rate": 4.7e-05,
      "loss": 0.9964,
      "step": 47
    },
    {
      "epoch": 0.10153358011634056,
      "grad_norm": 4.33770751953125,
      "learning_rate": 4.8e-05,
      "loss": 1.026,
      "step": 48
    },
    {
      "epoch": 0.10364886303543099,
      "grad_norm": 4.407140254974365,
      "learning_rate": 4.9e-05,
      "loss": 1.1072,
      "step": 49
    },
    {
      "epoch": 0.10576414595452142,
      "grad_norm": 5.298942565917969,
      "learning_rate": 5e-05,
      "loss": 1.0666,
      "step": 50
    },
    {
      "epoch": 0.10787942887361185,
      "grad_norm": 2.503105640411377,
      "learning_rate": 5.1000000000000006e-05,
      "loss": 0.8011,
      "step": 51
    },
    {
      "epoch": 0.10999471179270227,
      "grad_norm": 2.9427649974823,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.0133,
      "step": 52
    },
    {
      "epoch": 0.1121099947117927,
      "grad_norm": 2.7932252883911133,
      "learning_rate": 5.300000000000001e-05,
      "loss": 0.945,
      "step": 53
    },
    {
      "epoch": 0.11422527763088314,
      "grad_norm": 3.1461985111236572,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 1.0524,
      "step": 54
    },
    {
      "epoch": 0.11634056054997356,
      "grad_norm": 2.978358030319214,
      "learning_rate": 5.500000000000001e-05,
      "loss": 1.1368,
      "step": 55
    },
    {
      "epoch": 0.11845584346906399,
      "grad_norm": 2.626965045928955,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.0135,
      "step": 56
    },
    {
      "epoch": 0.12057112638815441,
      "grad_norm": 2.747119903564453,
      "learning_rate": 5.6999999999999996e-05,
      "loss": 0.9616,
      "step": 57
    },
    {
      "epoch": 0.12268640930724485,
      "grad_norm": 2.822309732437134,
      "learning_rate": 5.8e-05,
      "loss": 0.9357,
      "step": 58
    },
    {
      "epoch": 0.12480169222633528,
      "grad_norm": 3.006105899810791,
      "learning_rate": 5.9e-05,
      "loss": 0.933,
      "step": 59
    },
    {
      "epoch": 0.12691697514542571,
      "grad_norm": 3.2313544750213623,
      "learning_rate": 6e-05,
      "loss": 1.0129,
      "step": 60
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 2.9400298595428467,
      "learning_rate": 6.1e-05,
      "loss": 1.0006,
      "step": 61
    },
    {
      "epoch": 0.13114754098360656,
      "grad_norm": 3.390087127685547,
      "learning_rate": 6.2e-05,
      "loss": 0.9802,
      "step": 62
    },
    {
      "epoch": 0.13326282390269698,
      "grad_norm": 3.047139883041382,
      "learning_rate": 6.3e-05,
      "loss": 1.0711,
      "step": 63
    },
    {
      "epoch": 0.13537810682178741,
      "grad_norm": 3.0379374027252197,
      "learning_rate": 6.400000000000001e-05,
      "loss": 0.9628,
      "step": 64
    },
    {
      "epoch": 0.13749338974087785,
      "grad_norm": 2.8749332427978516,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.9946,
      "step": 65
    },
    {
      "epoch": 0.13960867265996826,
      "grad_norm": 2.915452003479004,
      "learning_rate": 6.6e-05,
      "loss": 0.9382,
      "step": 66
    },
    {
      "epoch": 0.1417239555790587,
      "grad_norm": 3.1922085285186768,
      "learning_rate": 6.7e-05,
      "loss": 1.0699,
      "step": 67
    },
    {
      "epoch": 0.14383923849814914,
      "grad_norm": 3.210163116455078,
      "learning_rate": 6.800000000000001e-05,
      "loss": 1.0313,
      "step": 68
    },
    {
      "epoch": 0.14595452141723955,
      "grad_norm": 3.5844082832336426,
      "learning_rate": 6.9e-05,
      "loss": 1.1133,
      "step": 69
    },
    {
      "epoch": 0.14806980433633,
      "grad_norm": 3.134580612182617,
      "learning_rate": 7e-05,
      "loss": 1.0114,
      "step": 70
    },
    {
      "epoch": 0.1501850872554204,
      "grad_norm": 2.9446849822998047,
      "learning_rate": 7.1e-05,
      "loss": 0.949,
      "step": 71
    },
    {
      "epoch": 0.15230037017451084,
      "grad_norm": 3.1270101070404053,
      "learning_rate": 7.2e-05,
      "loss": 0.9519,
      "step": 72
    },
    {
      "epoch": 0.15441565309360128,
      "grad_norm": 3.146308422088623,
      "learning_rate": 7.3e-05,
      "loss": 1.0239,
      "step": 73
    },
    {
      "epoch": 0.1565309360126917,
      "grad_norm": 3.3636350631713867,
      "learning_rate": 7.4e-05,
      "loss": 1.0306,
      "step": 74
    },
    {
      "epoch": 0.15864621893178213,
      "grad_norm": 3.30778431892395,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.1119,
      "step": 75
    },
    {
      "epoch": 0.16076150185087257,
      "grad_norm": 3.4229915142059326,
      "learning_rate": 7.6e-05,
      "loss": 0.9261,
      "step": 76
    },
    {
      "epoch": 0.16287678476996298,
      "grad_norm": 3.0347306728363037,
      "learning_rate": 7.7e-05,
      "loss": 0.9614,
      "step": 77
    },
    {
      "epoch": 0.16499206768905342,
      "grad_norm": 3.31915283203125,
      "learning_rate": 7.800000000000001e-05,
      "loss": 1.0835,
      "step": 78
    },
    {
      "epoch": 0.16710735060814383,
      "grad_norm": 3.4393112659454346,
      "learning_rate": 7.900000000000001e-05,
      "loss": 0.9913,
      "step": 79
    },
    {
      "epoch": 0.16922263352723427,
      "grad_norm": 3.4449856281280518,
      "learning_rate": 8e-05,
      "loss": 0.8581,
      "step": 80
    },
    {
      "epoch": 0.1713379164463247,
      "grad_norm": 3.2748067378997803,
      "learning_rate": 8.1e-05,
      "loss": 1.0115,
      "step": 81
    },
    {
      "epoch": 0.17345319936541512,
      "grad_norm": 3.2635154724121094,
      "learning_rate": 8.2e-05,
      "loss": 0.9795,
      "step": 82
    },
    {
      "epoch": 0.17556848228450556,
      "grad_norm": 3.0997257232666016,
      "learning_rate": 8.3e-05,
      "loss": 0.9917,
      "step": 83
    },
    {
      "epoch": 0.177683765203596,
      "grad_norm": 3.5653927326202393,
      "learning_rate": 8.4e-05,
      "loss": 1.0756,
      "step": 84
    },
    {
      "epoch": 0.1797990481226864,
      "grad_norm": 3.5017178058624268,
      "learning_rate": 8.5e-05,
      "loss": 1.0704,
      "step": 85
    },
    {
      "epoch": 0.18191433104177684,
      "grad_norm": 3.7366182804107666,
      "learning_rate": 8.6e-05,
      "loss": 1.0626,
      "step": 86
    },
    {
      "epoch": 0.18402961396086726,
      "grad_norm": 3.33809494972229,
      "learning_rate": 8.7e-05,
      "loss": 0.9562,
      "step": 87
    },
    {
      "epoch": 0.1861448968799577,
      "grad_norm": 3.7696216106414795,
      "learning_rate": 8.800000000000001e-05,
      "loss": 0.9564,
      "step": 88
    },
    {
      "epoch": 0.18826017979904813,
      "grad_norm": 3.3780527114868164,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.9696,
      "step": 89
    },
    {
      "epoch": 0.19037546271813854,
      "grad_norm": 3.411578893661499,
      "learning_rate": 9e-05,
      "loss": 1.0817,
      "step": 90
    },
    {
      "epoch": 0.19249074563722898,
      "grad_norm": 3.6279563903808594,
      "learning_rate": 9.1e-05,
      "loss": 0.9145,
      "step": 91
    },
    {
      "epoch": 0.19460602855631942,
      "grad_norm": 3.7208714485168457,
      "learning_rate": 9.200000000000001e-05,
      "loss": 1.0407,
      "step": 92
    },
    {
      "epoch": 0.19672131147540983,
      "grad_norm": 3.305312156677246,
      "learning_rate": 9.300000000000001e-05,
      "loss": 0.9349,
      "step": 93
    },
    {
      "epoch": 0.19883659439450027,
      "grad_norm": 3.571962356567383,
      "learning_rate": 9.4e-05,
      "loss": 1.0338,
      "step": 94
    },
    {
      "epoch": 0.20095187731359068,
      "grad_norm": 3.485724687576294,
      "learning_rate": 9.5e-05,
      "loss": 0.9924,
      "step": 95
    },
    {
      "epoch": 0.20306716023268112,
      "grad_norm": 3.522717237472534,
      "learning_rate": 9.6e-05,
      "loss": 0.9256,
      "step": 96
    },
    {
      "epoch": 0.20518244315177156,
      "grad_norm": 3.6267905235290527,
      "learning_rate": 9.7e-05,
      "loss": 0.8813,
      "step": 97
    },
    {
      "epoch": 0.20729772607086197,
      "grad_norm": 3.9175007343292236,
      "learning_rate": 9.8e-05,
      "loss": 1.054,
      "step": 98
    },
    {
      "epoch": 0.2094130089899524,
      "grad_norm": 4.084409713745117,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.9527,
      "step": 99
    },
    {
      "epoch": 0.21152829190904285,
      "grad_norm": 4.255803108215332,
      "learning_rate": 0.0001,
      "loss": 0.976,
      "step": 100
    },
    {
      "epoch": 0.21364357482813326,
      "grad_norm": 2.8322207927703857,
      "learning_rate": 9.99982170002055e-05,
      "loss": 0.8718,
      "step": 101
    },
    {
      "epoch": 0.2157588577472237,
      "grad_norm": 3.1382946968078613,
      "learning_rate": 9.99928681279855e-05,
      "loss": 0.959,
      "step": 102
    },
    {
      "epoch": 0.2178741406663141,
      "grad_norm": 2.58886456489563,
      "learning_rate": 9.998395376482152e-05,
      "loss": 0.8565,
      "step": 103
    },
    {
      "epoch": 0.21998942358540455,
      "grad_norm": 2.7843730449676514,
      "learning_rate": 9.99714745464859e-05,
      "loss": 0.9355,
      "step": 104
    },
    {
      "epoch": 0.222104706504495,
      "grad_norm": 2.447822332382202,
      "learning_rate": 9.995543136299636e-05,
      "loss": 0.838,
      "step": 105
    },
    {
      "epoch": 0.2242199894235854,
      "grad_norm": 2.584947347640991,
      "learning_rate": 9.993582535855263e-05,
      "loss": 0.9728,
      "step": 106
    },
    {
      "epoch": 0.22633527234267584,
      "grad_norm": 2.7623209953308105,
      "learning_rate": 9.991265793145479e-05,
      "loss": 0.8943,
      "step": 107
    },
    {
      "epoch": 0.22845055526176627,
      "grad_norm": 2.9683995246887207,
      "learning_rate": 9.988593073400354e-05,
      "loss": 0.8888,
      "step": 108
    },
    {
      "epoch": 0.23056583818085669,
      "grad_norm": 2.9614081382751465,
      "learning_rate": 9.985564567238237e-05,
      "loss": 0.9477,
      "step": 109
    },
    {
      "epoch": 0.23268112109994712,
      "grad_norm": 2.800546646118164,
      "learning_rate": 9.982180490652165e-05,
      "loss": 0.9267,
      "step": 110
    },
    {
      "epoch": 0.23479640401903754,
      "grad_norm": 2.7987921237945557,
      "learning_rate": 9.978441084994453e-05,
      "loss": 0.9653,
      "step": 111
    },
    {
      "epoch": 0.23691168693812797,
      "grad_norm": 2.9128034114837646,
      "learning_rate": 9.974346616959476e-05,
      "loss": 0.9898,
      "step": 112
    },
    {
      "epoch": 0.2390269698572184,
      "grad_norm": 2.837714433670044,
      "learning_rate": 9.969897378564666e-05,
      "loss": 0.984,
      "step": 113
    },
    {
      "epoch": 0.24114225277630882,
      "grad_norm": 3.1094441413879395,
      "learning_rate": 9.965093687129668e-05,
      "loss": 0.8232,
      "step": 114
    },
    {
      "epoch": 0.24325753569539926,
      "grad_norm": 2.8722965717315674,
      "learning_rate": 9.959935885253716e-05,
      "loss": 0.9426,
      "step": 115
    },
    {
      "epoch": 0.2453728186144897,
      "grad_norm": 2.8371827602386475,
      "learning_rate": 9.954424340791196e-05,
      "loss": 0.9326,
      "step": 116
    },
    {
      "epoch": 0.2474881015335801,
      "grad_norm": 2.9134371280670166,
      "learning_rate": 9.948559446825412e-05,
      "loss": 0.9904,
      "step": 117
    },
    {
      "epoch": 0.24960338445267055,
      "grad_norm": 2.8855326175689697,
      "learning_rate": 9.942341621640558e-05,
      "loss": 0.8379,
      "step": 118
    },
    {
      "epoch": 0.25171866737176096,
      "grad_norm": 2.910688877105713,
      "learning_rate": 9.935771308691871e-05,
      "loss": 0.9578,
      "step": 119
    },
    {
      "epoch": 0.25383395029085143,
      "grad_norm": 3.0281407833099365,
      "learning_rate": 9.928848976574019e-05,
      "loss": 0.8464,
      "step": 120
    },
    {
      "epoch": 0.25594923320994184,
      "grad_norm": 3.0733401775360107,
      "learning_rate": 9.921575118987672e-05,
      "loss": 0.9664,
      "step": 121
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 2.8841378688812256,
      "learning_rate": 9.913950254704291e-05,
      "loss": 1.0148,
      "step": 122
    },
    {
      "epoch": 0.26017979904812266,
      "grad_norm": 3.1016345024108887,
      "learning_rate": 9.905974927529135e-05,
      "loss": 0.9315,
      "step": 123
    },
    {
      "epoch": 0.26229508196721313,
      "grad_norm": 3.0234508514404297,
      "learning_rate": 9.897649706262473e-05,
      "loss": 0.9403,
      "step": 124
    },
    {
      "epoch": 0.26441036488630354,
      "grad_norm": 2.8850417137145996,
      "learning_rate": 9.888975184659018e-05,
      "loss": 0.9036,
      "step": 125
    },
    {
      "epoch": 0.26652564780539395,
      "grad_norm": 2.760040044784546,
      "learning_rate": 9.879951981385578e-05,
      "loss": 0.8731,
      "step": 126
    },
    {
      "epoch": 0.2686409307244844,
      "grad_norm": 2.6963107585906982,
      "learning_rate": 9.870580739976935e-05,
      "loss": 0.8391,
      "step": 127
    },
    {
      "epoch": 0.27075621364357483,
      "grad_norm": 3.0519609451293945,
      "learning_rate": 9.860862128789953e-05,
      "loss": 0.8089,
      "step": 128
    },
    {
      "epoch": 0.27287149656266524,
      "grad_norm": 2.85986065864563,
      "learning_rate": 9.8507968409559e-05,
      "loss": 0.9419,
      "step": 129
    },
    {
      "epoch": 0.2749867794817557,
      "grad_norm": 3.2540230751037598,
      "learning_rate": 9.840385594331022e-05,
      "loss": 0.9576,
      "step": 130
    },
    {
      "epoch": 0.2771020624008461,
      "grad_norm": 3.103649616241455,
      "learning_rate": 9.829629131445342e-05,
      "loss": 0.9163,
      "step": 131
    },
    {
      "epoch": 0.2792173453199365,
      "grad_norm": 2.9602162837982178,
      "learning_rate": 9.818528219449705e-05,
      "loss": 0.8656,
      "step": 132
    },
    {
      "epoch": 0.281332628239027,
      "grad_norm": 2.9030895233154297,
      "learning_rate": 9.807083650061063e-05,
      "loss": 0.8838,
      "step": 133
    },
    {
      "epoch": 0.2834479111581174,
      "grad_norm": 3.06990909576416,
      "learning_rate": 9.795296239506012e-05,
      "loss": 0.9112,
      "step": 134
    },
    {
      "epoch": 0.2855631940772078,
      "grad_norm": 3.180403470993042,
      "learning_rate": 9.783166828462574e-05,
      "loss": 0.9657,
      "step": 135
    },
    {
      "epoch": 0.2876784769962983,
      "grad_norm": 3.3117172718048096,
      "learning_rate": 9.770696282000244e-05,
      "loss": 0.9669,
      "step": 136
    },
    {
      "epoch": 0.2897937599153887,
      "grad_norm": 3.1493141651153564,
      "learning_rate": 9.757885489518297e-05,
      "loss": 1.0193,
      "step": 137
    },
    {
      "epoch": 0.2919090428344791,
      "grad_norm": 3.5290863513946533,
      "learning_rate": 9.744735364682346e-05,
      "loss": 0.952,
      "step": 138
    },
    {
      "epoch": 0.2940243257535695,
      "grad_norm": 2.9906020164489746,
      "learning_rate": 9.731246845359186e-05,
      "loss": 0.8628,
      "step": 139
    },
    {
      "epoch": 0.29613960867266,
      "grad_norm": 3.1866257190704346,
      "learning_rate": 9.717420893549902e-05,
      "loss": 0.9335,
      "step": 140
    },
    {
      "epoch": 0.2982548915917504,
      "grad_norm": 3.139331817626953,
      "learning_rate": 9.703258495321266e-05,
      "loss": 0.9351,
      "step": 141
    },
    {
      "epoch": 0.3003701745108408,
      "grad_norm": 3.252739667892456,
      "learning_rate": 9.688760660735402e-05,
      "loss": 0.9207,
      "step": 142
    },
    {
      "epoch": 0.30248545742993127,
      "grad_norm": 3.4457814693450928,
      "learning_rate": 9.673928423777756e-05,
      "loss": 1.1494,
      "step": 143
    },
    {
      "epoch": 0.3046007403490217,
      "grad_norm": 3.2288053035736084,
      "learning_rate": 9.658762842283343e-05,
      "loss": 0.9069,
      "step": 144
    },
    {
      "epoch": 0.3067160232681121,
      "grad_norm": 2.987396240234375,
      "learning_rate": 9.643264997861312e-05,
      "loss": 0.9142,
      "step": 145
    },
    {
      "epoch": 0.30883130618720256,
      "grad_norm": 3.496523141860962,
      "learning_rate": 9.627435995817799e-05,
      "loss": 0.9341,
      "step": 146
    },
    {
      "epoch": 0.31094658910629297,
      "grad_norm": 3.4983298778533936,
      "learning_rate": 9.611276965077099e-05,
      "loss": 0.8686,
      "step": 147
    },
    {
      "epoch": 0.3130618720253834,
      "grad_norm": 3.702148199081421,
      "learning_rate": 9.594789058101153e-05,
      "loss": 1.0261,
      "step": 148
    },
    {
      "epoch": 0.31517715494447385,
      "grad_norm": 4.358850002288818,
      "learning_rate": 9.577973450807351e-05,
      "loss": 1.0421,
      "step": 149
    },
    {
      "epoch": 0.31729243786356426,
      "grad_norm": 4.173257350921631,
      "learning_rate": 9.560831342484667e-05,
      "loss": 0.915,
      "step": 150
    },
    {
      "epoch": 0.31940772078265467,
      "grad_norm": 2.3479433059692383,
      "learning_rate": 9.543363955708125e-05,
      "loss": 0.7801,
      "step": 151
    },
    {
      "epoch": 0.32152300370174514,
      "grad_norm": 2.8180437088012695,
      "learning_rate": 9.525572536251607e-05,
      "loss": 0.8987,
      "step": 152
    },
    {
      "epoch": 0.32363828662083555,
      "grad_norm": 2.8835885524749756,
      "learning_rate": 9.507458352999001e-05,
      "loss": 0.9624,
      "step": 153
    },
    {
      "epoch": 0.32575356953992596,
      "grad_norm": 2.63946270942688,
      "learning_rate": 9.489022697853709e-05,
      "loss": 0.9057,
      "step": 154
    },
    {
      "epoch": 0.32786885245901637,
      "grad_norm": 2.484527349472046,
      "learning_rate": 9.470266885646503e-05,
      "loss": 0.8801,
      "step": 155
    },
    {
      "epoch": 0.32998413537810684,
      "grad_norm": 2.4756839275360107,
      "learning_rate": 9.451192254041758e-05,
      "loss": 0.8307,
      "step": 156
    },
    {
      "epoch": 0.33209941829719725,
      "grad_norm": 2.497173547744751,
      "learning_rate": 9.431800163442041e-05,
      "loss": 0.7906,
      "step": 157
    },
    {
      "epoch": 0.33421470121628766,
      "grad_norm": 2.7058324813842773,
      "learning_rate": 9.412091996891096e-05,
      "loss": 0.9039,
      "step": 158
    },
    {
      "epoch": 0.3363299841353781,
      "grad_norm": 2.7305657863616943,
      "learning_rate": 9.392069159975199e-05,
      "loss": 0.9251,
      "step": 159
    },
    {
      "epoch": 0.33844526705446853,
      "grad_norm": 2.722649574279785,
      "learning_rate": 9.371733080722911e-05,
      "loss": 0.9382,
      "step": 160
    },
    {
      "epoch": 0.34056054997355895,
      "grad_norm": 2.5822291374206543,
      "learning_rate": 9.351085209503241e-05,
      "loss": 0.9322,
      "step": 161
    },
    {
      "epoch": 0.3426758328926494,
      "grad_norm": 2.520709276199341,
      "learning_rate": 9.330127018922194e-05,
      "loss": 0.9386,
      "step": 162
    },
    {
      "epoch": 0.3447911158117398,
      "grad_norm": 2.785912275314331,
      "learning_rate": 9.308860003717748e-05,
      "loss": 0.853,
      "step": 163
    },
    {
      "epoch": 0.34690639873083023,
      "grad_norm": 2.759871482849121,
      "learning_rate": 9.287285680653254e-05,
      "loss": 0.8206,
      "step": 164
    },
    {
      "epoch": 0.3490216816499207,
      "grad_norm": 2.5678517818450928,
      "learning_rate": 9.265405588409257e-05,
      "loss": 0.8493,
      "step": 165
    },
    {
      "epoch": 0.3511369645690111,
      "grad_norm": 2.4928886890411377,
      "learning_rate": 9.243221287473756e-05,
      "loss": 0.7925,
      "step": 166
    },
    {
      "epoch": 0.3532522474881015,
      "grad_norm": 2.80059814453125,
      "learning_rate": 9.220734360030907e-05,
      "loss": 0.9222,
      "step": 167
    },
    {
      "epoch": 0.355367530407192,
      "grad_norm": 2.5495877265930176,
      "learning_rate": 9.197946409848194e-05,
      "loss": 0.9326,
      "step": 168
    },
    {
      "epoch": 0.3574828133262824,
      "grad_norm": 2.6717426776885986,
      "learning_rate": 9.174859062162038e-05,
      "loss": 0.8938,
      "step": 169
    },
    {
      "epoch": 0.3595980962453728,
      "grad_norm": 2.851301908493042,
      "learning_rate": 9.151473963561883e-05,
      "loss": 0.864,
      "step": 170
    },
    {
      "epoch": 0.3617133791644632,
      "grad_norm": 2.684170722961426,
      "learning_rate": 9.127792781872769e-05,
      "loss": 0.7993,
      "step": 171
    },
    {
      "epoch": 0.3638286620835537,
      "grad_norm": 2.6831891536712646,
      "learning_rate": 9.103817206036382e-05,
      "loss": 0.9036,
      "step": 172
    },
    {
      "epoch": 0.3659439450026441,
      "grad_norm": 2.7455031871795654,
      "learning_rate": 9.079548945990593e-05,
      "loss": 0.8815,
      "step": 173
    },
    {
      "epoch": 0.3680592279217345,
      "grad_norm": 2.7768361568450928,
      "learning_rate": 9.054989732547506e-05,
      "loss": 0.8734,
      "step": 174
    },
    {
      "epoch": 0.370174510840825,
      "grad_norm": 3.000823497772217,
      "learning_rate": 9.030141317270026e-05,
      "loss": 0.901,
      "step": 175
    },
    {
      "epoch": 0.3722897937599154,
      "grad_norm": 2.698535203933716,
      "learning_rate": 9.005005472346924e-05,
      "loss": 0.9426,
      "step": 176
    },
    {
      "epoch": 0.3744050766790058,
      "grad_norm": 2.9230258464813232,
      "learning_rate": 8.979583990466454e-05,
      "loss": 0.9412,
      "step": 177
    },
    {
      "epoch": 0.37652035959809627,
      "grad_norm": 2.8420345783233643,
      "learning_rate": 8.953878684688493e-05,
      "loss": 0.9279,
      "step": 178
    },
    {
      "epoch": 0.3786356425171867,
      "grad_norm": 2.883340358734131,
      "learning_rate": 8.927891388315242e-05,
      "loss": 0.932,
      "step": 179
    },
    {
      "epoch": 0.3807509254362771,
      "grad_norm": 2.778456926345825,
      "learning_rate": 8.90162395476046e-05,
      "loss": 0.907,
      "step": 180
    },
    {
      "epoch": 0.38286620835536755,
      "grad_norm": 2.811483383178711,
      "learning_rate": 8.875078257417295e-05,
      "loss": 0.8524,
      "step": 181
    },
    {
      "epoch": 0.38498149127445797,
      "grad_norm": 2.951821804046631,
      "learning_rate": 8.848256189524662e-05,
      "loss": 0.9459,
      "step": 182
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 2.9781339168548584,
      "learning_rate": 8.821159664032223e-05,
      "loss": 0.9516,
      "step": 183
    },
    {
      "epoch": 0.38921205711263884,
      "grad_norm": 2.985658645629883,
      "learning_rate": 8.793790613463955e-05,
      "loss": 0.9337,
      "step": 184
    },
    {
      "epoch": 0.39132734003172925,
      "grad_norm": 2.818643808364868,
      "learning_rate": 8.766150989780318e-05,
      "loss": 0.8606,
      "step": 185
    },
    {
      "epoch": 0.39344262295081966,
      "grad_norm": 3.2023520469665527,
      "learning_rate": 8.738242764239046e-05,
      "loss": 0.9403,
      "step": 186
    },
    {
      "epoch": 0.3955579058699101,
      "grad_norm": 3.15983510017395,
      "learning_rate": 8.710067927254555e-05,
      "loss": 0.878,
      "step": 187
    },
    {
      "epoch": 0.39767318878900054,
      "grad_norm": 3.0788826942443848,
      "learning_rate": 8.681628488255986e-05,
      "loss": 0.8531,
      "step": 188
    },
    {
      "epoch": 0.39978847170809095,
      "grad_norm": 3.163588285446167,
      "learning_rate": 8.652926475543899e-05,
      "loss": 1.0054,
      "step": 189
    },
    {
      "epoch": 0.40190375462718136,
      "grad_norm": 3.0600147247314453,
      "learning_rate": 8.6239639361456e-05,
      "loss": 0.9252,
      "step": 190
    },
    {
      "epoch": 0.40401903754627183,
      "grad_norm": 3.1859021186828613,
      "learning_rate": 8.594742935669165e-05,
      "loss": 0.9387,
      "step": 191
    },
    {
      "epoch": 0.40613432046536224,
      "grad_norm": 3.0455939769744873,
      "learning_rate": 8.565265558156101e-05,
      "loss": 0.8999,
      "step": 192
    },
    {
      "epoch": 0.40824960338445265,
      "grad_norm": 3.108290672302246,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.81,
      "step": 193
    },
    {
      "epoch": 0.4103648863035431,
      "grad_norm": 3.2528982162475586,
      "learning_rate": 8.505550099460265e-05,
      "loss": 0.8264,
      "step": 194
    },
    {
      "epoch": 0.41248016922263353,
      "grad_norm": 3.38067889213562,
      "learning_rate": 8.475316277183509e-05,
      "loss": 0.9919,
      "step": 195
    },
    {
      "epoch": 0.41459545214172394,
      "grad_norm": 3.193894147872925,
      "learning_rate": 8.444834595378434e-05,
      "loss": 0.9069,
      "step": 196
    },
    {
      "epoch": 0.4167107350608144,
      "grad_norm": 3.862703323364258,
      "learning_rate": 8.414107227998329e-05,
      "loss": 0.8882,
      "step": 197
    },
    {
      "epoch": 0.4188260179799048,
      "grad_norm": 3.4907665252685547,
      "learning_rate": 8.383136366518788e-05,
      "loss": 1.0051,
      "step": 198
    },
    {
      "epoch": 0.42094130089899523,
      "grad_norm": 3.8930423259735107,
      "learning_rate": 8.351924219781393e-05,
      "loss": 0.8381,
      "step": 199
    },
    {
      "epoch": 0.4230565838180857,
      "grad_norm": 4.245903491973877,
      "learning_rate": 8.320473013836196e-05,
      "loss": 0.9331,
      "step": 200
    },
    {
      "epoch": 0.4251718667371761,
      "grad_norm": 2.643822431564331,
      "learning_rate": 8.288784991782946e-05,
      "loss": 0.7513,
      "step": 201
    },
    {
      "epoch": 0.4272871496562665,
      "grad_norm": 2.622617244720459,
      "learning_rate": 8.256862413611113e-05,
      "loss": 0.7931,
      "step": 202
    },
    {
      "epoch": 0.42940243257535693,
      "grad_norm": 3.409309148788452,
      "learning_rate": 8.22470755603871e-05,
      "loss": 1.2241,
      "step": 203
    },
    {
      "epoch": 0.4315177154944474,
      "grad_norm": 2.4329824447631836,
      "learning_rate": 8.192322712349917e-05,
      "loss": 0.8528,
      "step": 204
    },
    {
      "epoch": 0.4336329984135378,
      "grad_norm": 2.328763484954834,
      "learning_rate": 8.15971019223152e-05,
      "loss": 0.8701,
      "step": 205
    },
    {
      "epoch": 0.4357482813326282,
      "grad_norm": 2.2439918518066406,
      "learning_rate": 8.126872321608184e-05,
      "loss": 0.8036,
      "step": 206
    },
    {
      "epoch": 0.4378635642517187,
      "grad_norm": 2.3202691078186035,
      "learning_rate": 8.093811442476573e-05,
      "loss": 0.8685,
      "step": 207
    },
    {
      "epoch": 0.4399788471708091,
      "grad_norm": 2.3466012477874756,
      "learning_rate": 8.060529912738315e-05,
      "loss": 0.8369,
      "step": 208
    },
    {
      "epoch": 0.4420941300898995,
      "grad_norm": 2.5325000286102295,
      "learning_rate": 8.027030106031836e-05,
      "loss": 0.943,
      "step": 209
    },
    {
      "epoch": 0.44420941300899,
      "grad_norm": 2.6998066902160645,
      "learning_rate": 7.993314411563075e-05,
      "loss": 0.9273,
      "step": 210
    },
    {
      "epoch": 0.4463246959280804,
      "grad_norm": 2.5904581546783447,
      "learning_rate": 7.959385233935085e-05,
      "loss": 0.9509,
      "step": 211
    },
    {
      "epoch": 0.4484399788471708,
      "grad_norm": 2.5315418243408203,
      "learning_rate": 7.925244992976538e-05,
      "loss": 0.8049,
      "step": 212
    },
    {
      "epoch": 0.45055526176626126,
      "grad_norm": 2.661184310913086,
      "learning_rate": 7.890896123569136e-05,
      "loss": 0.9386,
      "step": 213
    },
    {
      "epoch": 0.4526705446853517,
      "grad_norm": 2.444127321243286,
      "learning_rate": 7.856341075473962e-05,
      "loss": 0.8698,
      "step": 214
    },
    {
      "epoch": 0.4547858276044421,
      "grad_norm": 2.5407636165618896,
      "learning_rate": 7.821582313156764e-05,
      "loss": 0.8877,
      "step": 215
    },
    {
      "epoch": 0.45690111052353255,
      "grad_norm": 2.7174148559570312,
      "learning_rate": 7.786622315612183e-05,
      "loss": 0.8976,
      "step": 216
    },
    {
      "epoch": 0.45901639344262296,
      "grad_norm": 2.5770790576934814,
      "learning_rate": 7.751463576186957e-05,
      "loss": 0.8609,
      "step": 217
    },
    {
      "epoch": 0.46113167636171337,
      "grad_norm": 2.5155019760131836,
      "learning_rate": 7.716108602402094e-05,
      "loss": 0.8183,
      "step": 218
    },
    {
      "epoch": 0.4632469592808038,
      "grad_norm": 2.812934637069702,
      "learning_rate": 7.680559915774034e-05,
      "loss": 0.9306,
      "step": 219
    },
    {
      "epoch": 0.46536224219989425,
      "grad_norm": 2.5536584854125977,
      "learning_rate": 7.644820051634812e-05,
      "loss": 0.8586,
      "step": 220
    },
    {
      "epoch": 0.46747752511898466,
      "grad_norm": 2.4447860717773438,
      "learning_rate": 7.608891558951249e-05,
      "loss": 0.8023,
      "step": 221
    },
    {
      "epoch": 0.46959280803807507,
      "grad_norm": 2.5778310298919678,
      "learning_rate": 7.572777000143145e-05,
      "loss": 0.8771,
      "step": 222
    },
    {
      "epoch": 0.47170809095716554,
      "grad_norm": 2.6907410621643066,
      "learning_rate": 7.536478950900538e-05,
      "loss": 0.8514,
      "step": 223
    },
    {
      "epoch": 0.47382337387625595,
      "grad_norm": 2.682229995727539,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.8711,
      "step": 224
    },
    {
      "epoch": 0.47593865679534636,
      "grad_norm": 2.6666316986083984,
      "learning_rate": 7.463342749120014e-05,
      "loss": 0.8428,
      "step": 225
    },
    {
      "epoch": 0.4780539397144368,
      "grad_norm": 2.8513665199279785,
      "learning_rate": 7.426509812655406e-05,
      "loss": 0.877,
      "step": 226
    },
    {
      "epoch": 0.48016922263352724,
      "grad_norm": 2.9401919841766357,
      "learning_rate": 7.389503817530905e-05,
      "loss": 0.9199,
      "step": 227
    },
    {
      "epoch": 0.48228450555261765,
      "grad_norm": 2.7450366020202637,
      "learning_rate": 7.35232740301378e-05,
      "loss": 0.8136,
      "step": 228
    },
    {
      "epoch": 0.4843997884717081,
      "grad_norm": 2.990210771560669,
      "learning_rate": 7.314983220525604e-05,
      "loss": 0.9458,
      "step": 229
    },
    {
      "epoch": 0.4865150713907985,
      "grad_norm": 2.915475606918335,
      "learning_rate": 7.27747393345317e-05,
      "loss": 0.9262,
      "step": 230
    },
    {
      "epoch": 0.48863035430988894,
      "grad_norm": 2.913248062133789,
      "learning_rate": 7.239802216958523e-05,
      "loss": 0.8632,
      "step": 231
    },
    {
      "epoch": 0.4907456372289794,
      "grad_norm": 3.139481544494629,
      "learning_rate": 7.201970757788172e-05,
      "loss": 0.9177,
      "step": 232
    },
    {
      "epoch": 0.4928609201480698,
      "grad_norm": 2.9794747829437256,
      "learning_rate": 7.163982254081475e-05,
      "loss": 0.8903,
      "step": 233
    },
    {
      "epoch": 0.4949762030671602,
      "grad_norm": 2.8736488819122314,
      "learning_rate": 7.125839415178204e-05,
      "loss": 0.8661,
      "step": 234
    },
    {
      "epoch": 0.49709148598625064,
      "grad_norm": 3.0208191871643066,
      "learning_rate": 7.087544961425317e-05,
      "loss": 0.8658,
      "step": 235
    },
    {
      "epoch": 0.4992067689053411,
      "grad_norm": 2.7812483310699463,
      "learning_rate": 7.049101623982937e-05,
      "loss": 0.8354,
      "step": 236
    },
    {
      "epoch": 0.5013220518244316,
      "grad_norm": 3.3551738262176514,
      "learning_rate": 7.010512144629579e-05,
      "loss": 0.9178,
      "step": 237
    },
    {
      "epoch": 0.5034373347435219,
      "grad_norm": 2.927945375442505,
      "learning_rate": 6.971779275566593e-05,
      "loss": 0.9225,
      "step": 238
    },
    {
      "epoch": 0.5055526176626124,
      "grad_norm": 3.2569127082824707,
      "learning_rate": 6.93290577922188e-05,
      "loss": 0.9018,
      "step": 239
    }
  ],
  "logging_steps": 1,
  "max_steps": 472,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 239,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.159932941697024e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}