|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 995, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0020100502512562816, |
|
"grad_norm": 19.729604721069336, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 6.6152, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.004020100502512563, |
|
"grad_norm": 16.32933807373047, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 6.684, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.006030150753768844, |
|
"grad_norm": 9.964282989501953, |
|
"learning_rate": 6e-06, |
|
"loss": 6.4803, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.008040201005025126, |
|
"grad_norm": 9.893184661865234, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 5.9697, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.010050251256281407, |
|
"grad_norm": 7.443181037902832, |
|
"learning_rate": 1e-05, |
|
"loss": 5.7487, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.012060301507537688, |
|
"grad_norm": 5.485438346862793, |
|
"learning_rate": 1.2e-05, |
|
"loss": 5.6723, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.01407035175879397, |
|
"grad_norm": 4.975802421569824, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 5.4903, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.016080402010050253, |
|
"grad_norm": 5.055726051330566, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 5.2516, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.018090452261306532, |
|
"grad_norm": 4.720687389373779, |
|
"learning_rate": 1.8e-05, |
|
"loss": 5.0191, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.020100502512562814, |
|
"grad_norm": 3.445399761199951, |
|
"learning_rate": 2e-05, |
|
"loss": 4.8732, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.022110552763819097, |
|
"grad_norm": 3.533928155899048, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 5.1519, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.024120603015075376, |
|
"grad_norm": 3.4936132431030273, |
|
"learning_rate": 2.4e-05, |
|
"loss": 4.9552, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.02613065326633166, |
|
"grad_norm": 3.037400722503662, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 4.622, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.02814070351758794, |
|
"grad_norm": 4.790655136108398, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 4.5671, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03015075376884422, |
|
"grad_norm": 6.313861846923828, |
|
"learning_rate": 3e-05, |
|
"loss": 4.3285, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.032160804020100506, |
|
"grad_norm": 2.662900447845459, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 4.2636, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.034170854271356785, |
|
"grad_norm": 2.5860118865966797, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 4.3172, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.036180904522613064, |
|
"grad_norm": 2.960880994796753, |
|
"learning_rate": 3.6e-05, |
|
"loss": 3.9687, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.03819095477386935, |
|
"grad_norm": 3.4453978538513184, |
|
"learning_rate": 3.8e-05, |
|
"loss": 3.8505, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.04020100502512563, |
|
"grad_norm": 2.6721041202545166, |
|
"learning_rate": 4e-05, |
|
"loss": 3.971, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04221105527638191, |
|
"grad_norm": 2.6484851837158203, |
|
"learning_rate": 4.2e-05, |
|
"loss": 3.8536, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.044221105527638194, |
|
"grad_norm": 6.354692459106445, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 3.8466, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.04623115577889447, |
|
"grad_norm": 2.6469643115997314, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 3.5942, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.04824120603015075, |
|
"grad_norm": 2.7454721927642822, |
|
"learning_rate": 4.8e-05, |
|
"loss": 3.526, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.05025125628140704, |
|
"grad_norm": 2.445122480392456, |
|
"learning_rate": 5e-05, |
|
"loss": 3.5655, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05226130653266332, |
|
"grad_norm": 2.458037853240967, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 3.3741, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.054271356783919596, |
|
"grad_norm": 2.469712972640991, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 3.4174, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.05628140703517588, |
|
"grad_norm": 2.475627899169922, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 3.3984, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.05829145728643216, |
|
"grad_norm": 2.6965248584747314, |
|
"learning_rate": 5.8e-05, |
|
"loss": 3.5654, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.06030150753768844, |
|
"grad_norm": 2.3575079441070557, |
|
"learning_rate": 6e-05, |
|
"loss": 3.2454, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.062311557788944726, |
|
"grad_norm": 2.371737480163574, |
|
"learning_rate": 6.2e-05, |
|
"loss": 3.361, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.06432160804020101, |
|
"grad_norm": 2.144615650177002, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 3.2241, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.06633165829145729, |
|
"grad_norm": 1.9614999294281006, |
|
"learning_rate": 6.6e-05, |
|
"loss": 3.1206, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.06834170854271357, |
|
"grad_norm": 2.0626800060272217, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 3.3899, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.07035175879396985, |
|
"grad_norm": 2.669525623321533, |
|
"learning_rate": 7e-05, |
|
"loss": 3.2405, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07236180904522613, |
|
"grad_norm": 2.620374917984009, |
|
"learning_rate": 7.2e-05, |
|
"loss": 3.3299, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0743718592964824, |
|
"grad_norm": 2.2318763732910156, |
|
"learning_rate": 7.4e-05, |
|
"loss": 2.9798, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.0763819095477387, |
|
"grad_norm": 2.28938889503479, |
|
"learning_rate": 7.6e-05, |
|
"loss": 3.2799, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.07839195979899498, |
|
"grad_norm": 2.176002264022827, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 3.2466, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.08040201005025126, |
|
"grad_norm": 2.2421767711639404, |
|
"learning_rate": 8e-05, |
|
"loss": 3.0629, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08241206030150754, |
|
"grad_norm": 2.7618038654327393, |
|
"learning_rate": 8.2e-05, |
|
"loss": 3.2719, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.08442211055276382, |
|
"grad_norm": 2.1031386852264404, |
|
"learning_rate": 8.4e-05, |
|
"loss": 3.0094, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.0864321608040201, |
|
"grad_norm": 2.1169002056121826, |
|
"learning_rate": 8.6e-05, |
|
"loss": 3.021, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.08844221105527639, |
|
"grad_norm": 2.4154305458068848, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 3.0685, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.09045226130653267, |
|
"grad_norm": 2.4994680881500244, |
|
"learning_rate": 9e-05, |
|
"loss": 2.8943, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09246231155778895, |
|
"grad_norm": 3.4030494689941406, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 2.8389, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.09447236180904522, |
|
"grad_norm": 5.54694128036499, |
|
"learning_rate": 9.4e-05, |
|
"loss": 3.0157, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.0964824120603015, |
|
"grad_norm": 2.363124370574951, |
|
"learning_rate": 9.6e-05, |
|
"loss": 2.811, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.09849246231155778, |
|
"grad_norm": 2.0617117881774902, |
|
"learning_rate": 9.8e-05, |
|
"loss": 2.8819, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.10050251256281408, |
|
"grad_norm": 2.354085922241211, |
|
"learning_rate": 0.0001, |
|
"loss": 3.0962, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10251256281407035, |
|
"grad_norm": 2.3361897468566895, |
|
"learning_rate": 9.999972370327507e-05, |
|
"loss": 3.0454, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.10452261306532663, |
|
"grad_norm": 2.2245845794677734, |
|
"learning_rate": 9.999889481615387e-05, |
|
"loss": 3.1027, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.10653266331658291, |
|
"grad_norm": 2.069746494293213, |
|
"learning_rate": 9.999751334779716e-05, |
|
"loss": 2.915, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.10854271356783919, |
|
"grad_norm": 2.3541088104248047, |
|
"learning_rate": 9.999557931347273e-05, |
|
"loss": 2.8328, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.11055276381909548, |
|
"grad_norm": 2.2207984924316406, |
|
"learning_rate": 9.999309273455528e-05, |
|
"loss": 3.0153, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.11256281407035176, |
|
"grad_norm": 2.1303670406341553, |
|
"learning_rate": 9.999005363852618e-05, |
|
"loss": 2.8649, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.11457286432160804, |
|
"grad_norm": 2.084247589111328, |
|
"learning_rate": 9.998646205897309e-05, |
|
"loss": 2.9326, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.11658291457286432, |
|
"grad_norm": 2.121018171310425, |
|
"learning_rate": 9.998231803558968e-05, |
|
"loss": 2.6179, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.1185929648241206, |
|
"grad_norm": 2.2490546703338623, |
|
"learning_rate": 9.997762161417517e-05, |
|
"loss": 3.0462, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.12060301507537688, |
|
"grad_norm": 1.9550148248672485, |
|
"learning_rate": 9.997237284663379e-05, |
|
"loss": 2.8662, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12261306532663317, |
|
"grad_norm": 2.2603437900543213, |
|
"learning_rate": 9.996657179097421e-05, |
|
"loss": 2.7448, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.12462311557788945, |
|
"grad_norm": 1.897772192955017, |
|
"learning_rate": 9.996021851130897e-05, |
|
"loss": 2.8954, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.12663316582914572, |
|
"grad_norm": 2.1712026596069336, |
|
"learning_rate": 9.995331307785365e-05, |
|
"loss": 2.9001, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.12864321608040202, |
|
"grad_norm": 1.9912605285644531, |
|
"learning_rate": 9.994585556692624e-05, |
|
"loss": 3.0064, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.1306532663316583, |
|
"grad_norm": 2.094640016555786, |
|
"learning_rate": 9.993784606094612e-05, |
|
"loss": 2.6635, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.13266331658291458, |
|
"grad_norm": 2.1064651012420654, |
|
"learning_rate": 9.992928464843334e-05, |
|
"loss": 2.9224, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.13467336683417086, |
|
"grad_norm": 1.9782873392105103, |
|
"learning_rate": 9.992017142400751e-05, |
|
"loss": 2.8202, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.13668341708542714, |
|
"grad_norm": 2.189976692199707, |
|
"learning_rate": 9.991050648838675e-05, |
|
"loss": 2.7919, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.13869346733668342, |
|
"grad_norm": 2.0882322788238525, |
|
"learning_rate": 9.990028994838673e-05, |
|
"loss": 2.868, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.1407035175879397, |
|
"grad_norm": 1.8483355045318604, |
|
"learning_rate": 9.988952191691925e-05, |
|
"loss": 2.903, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14271356783919598, |
|
"grad_norm": 2.1383628845214844, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 2.6627, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.14472361809045226, |
|
"grad_norm": 2.0356757640838623, |
|
"learning_rate": 9.986633186170319e-05, |
|
"loss": 2.813, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.14673366834170853, |
|
"grad_norm": 2.4072749614715576, |
|
"learning_rate": 9.985391009424805e-05, |
|
"loss": 2.7325, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.1487437185929648, |
|
"grad_norm": 1.8525023460388184, |
|
"learning_rate": 9.984093734790956e-05, |
|
"loss": 2.7092, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.1507537688442211, |
|
"grad_norm": 1.8999531269073486, |
|
"learning_rate": 9.982741376606078e-05, |
|
"loss": 2.8637, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1527638190954774, |
|
"grad_norm": 2.2264699935913086, |
|
"learning_rate": 9.981333949816259e-05, |
|
"loss": 2.8068, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.15477386934673368, |
|
"grad_norm": 2.1256349086761475, |
|
"learning_rate": 9.979871469976196e-05, |
|
"loss": 2.5258, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.15678391959798996, |
|
"grad_norm": 1.8756039142608643, |
|
"learning_rate": 9.978353953249022e-05, |
|
"loss": 2.7572, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.15879396984924624, |
|
"grad_norm": 2.4083592891693115, |
|
"learning_rate": 9.976781416406136e-05, |
|
"loss": 2.564, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.16080402010050251, |
|
"grad_norm": 1.9118754863739014, |
|
"learning_rate": 9.975153876827008e-05, |
|
"loss": 2.7158, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1628140703517588, |
|
"grad_norm": 2.419102907180786, |
|
"learning_rate": 9.973471352498991e-05, |
|
"loss": 2.9075, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.16482412060301507, |
|
"grad_norm": 2.2566428184509277, |
|
"learning_rate": 9.971733862017126e-05, |
|
"loss": 2.7698, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.16683417085427135, |
|
"grad_norm": 1.980230450630188, |
|
"learning_rate": 9.969941424583926e-05, |
|
"loss": 2.6454, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.16884422110552763, |
|
"grad_norm": 2.0920395851135254, |
|
"learning_rate": 9.96809406000918e-05, |
|
"loss": 2.8594, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.1708542713567839, |
|
"grad_norm": 1.8870232105255127, |
|
"learning_rate": 9.966191788709716e-05, |
|
"loss": 2.5967, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1728643216080402, |
|
"grad_norm": 2.334998607635498, |
|
"learning_rate": 9.964234631709187e-05, |
|
"loss": 2.661, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.1748743718592965, |
|
"grad_norm": 1.9071625471115112, |
|
"learning_rate": 9.962222610637837e-05, |
|
"loss": 2.6161, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.17688442211055277, |
|
"grad_norm": 2.015760898590088, |
|
"learning_rate": 9.960155747732259e-05, |
|
"loss": 2.5681, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.17889447236180905, |
|
"grad_norm": 1.827253818511963, |
|
"learning_rate": 9.958034065835151e-05, |
|
"loss": 2.5538, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.18090452261306533, |
|
"grad_norm": 1.9388582706451416, |
|
"learning_rate": 9.955857588395065e-05, |
|
"loss": 2.7098, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1829145728643216, |
|
"grad_norm": 2.1864373683929443, |
|
"learning_rate": 9.95362633946614e-05, |
|
"loss": 2.5615, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.1849246231155779, |
|
"grad_norm": 2.093961477279663, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 2.7367, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.18693467336683417, |
|
"grad_norm": 1.9437264204025269, |
|
"learning_rate": 9.948999626384724e-05, |
|
"loss": 2.7414, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.18894472361809045, |
|
"grad_norm": 1.8001607656478882, |
|
"learning_rate": 9.946604213366057e-05, |
|
"loss": 2.7595, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.19095477386934673, |
|
"grad_norm": 1.8736133575439453, |
|
"learning_rate": 9.944154131125642e-05, |
|
"loss": 2.2943, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.192964824120603, |
|
"grad_norm": 1.8067212104797363, |
|
"learning_rate": 9.941649406741469e-05, |
|
"loss": 2.6118, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.19497487437185929, |
|
"grad_norm": 1.9442073106765747, |
|
"learning_rate": 9.939090067895422e-05, |
|
"loss": 2.5651, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.19698492462311556, |
|
"grad_norm": 1.976509690284729, |
|
"learning_rate": 9.936476142872979e-05, |
|
"loss": 2.5084, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.19899497487437187, |
|
"grad_norm": 1.9604930877685547, |
|
"learning_rate": 9.933807660562898e-05, |
|
"loss": 2.3689, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.20100502512562815, |
|
"grad_norm": 2.081547975540161, |
|
"learning_rate": 9.931084650456892e-05, |
|
"loss": 2.7038, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.20301507537688443, |
|
"grad_norm": 2.1766066551208496, |
|
"learning_rate": 9.928307142649316e-05, |
|
"loss": 2.8203, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.2050251256281407, |
|
"grad_norm": 2.217263698577881, |
|
"learning_rate": 9.925475167836821e-05, |
|
"loss": 2.5522, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.207035175879397, |
|
"grad_norm": 1.8877842426300049, |
|
"learning_rate": 9.922588757318021e-05, |
|
"loss": 2.6209, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.20904522613065327, |
|
"grad_norm": 1.8535610437393188, |
|
"learning_rate": 9.919647942993148e-05, |
|
"loss": 2.7805, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.21105527638190955, |
|
"grad_norm": 1.9754009246826172, |
|
"learning_rate": 9.916652757363698e-05, |
|
"loss": 2.7363, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.21306532663316582, |
|
"grad_norm": 2.0575201511383057, |
|
"learning_rate": 9.913603233532067e-05, |
|
"loss": 2.5019, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.2150753768844221, |
|
"grad_norm": 1.8286898136138916, |
|
"learning_rate": 9.910499405201195e-05, |
|
"loss": 2.5019, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.21708542713567838, |
|
"grad_norm": 1.954950213432312, |
|
"learning_rate": 9.907341306674185e-05, |
|
"loss": 2.5339, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.21909547738693466, |
|
"grad_norm": 1.8942043781280518, |
|
"learning_rate": 9.90412897285393e-05, |
|
"loss": 2.5065, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.22110552763819097, |
|
"grad_norm": 2.165287494659424, |
|
"learning_rate": 9.900862439242719e-05, |
|
"loss": 2.596, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.22311557788944725, |
|
"grad_norm": 1.8702832460403442, |
|
"learning_rate": 9.897541741941858e-05, |
|
"loss": 2.6656, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.22512562814070353, |
|
"grad_norm": 1.745975375175476, |
|
"learning_rate": 9.894166917651256e-05, |
|
"loss": 2.4078, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.2271356783919598, |
|
"grad_norm": 9.129441261291504, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 2.4112, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.22914572864321608, |
|
"grad_norm": 2.1922168731689453, |
|
"learning_rate": 9.887255037891086e-05, |
|
"loss": 2.5331, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.23115577889447236, |
|
"grad_norm": 1.9028476476669312, |
|
"learning_rate": 9.883718058810707e-05, |
|
"loss": 2.4778, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.23316582914572864, |
|
"grad_norm": 1.9489747285842896, |
|
"learning_rate": 9.880127105518122e-05, |
|
"loss": 2.6777, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.23517587939698492, |
|
"grad_norm": 1.922662377357483, |
|
"learning_rate": 9.876482217700078e-05, |
|
"loss": 2.5236, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.2371859296482412, |
|
"grad_norm": 1.9473645687103271, |
|
"learning_rate": 9.872783435639397e-05, |
|
"loss": 2.4375, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.23919597989949748, |
|
"grad_norm": 1.835803747177124, |
|
"learning_rate": 9.869030800214532e-05, |
|
"loss": 2.6191, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.24120603015075376, |
|
"grad_norm": 2.013805627822876, |
|
"learning_rate": 9.865224352899119e-05, |
|
"loss": 2.5062, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.24321608040201004, |
|
"grad_norm": 1.8618221282958984, |
|
"learning_rate": 9.861364135761517e-05, |
|
"loss": 2.3637, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.24522613065326634, |
|
"grad_norm": 2.0696043968200684, |
|
"learning_rate": 9.857450191464337e-05, |
|
"loss": 2.3941, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.24723618090452262, |
|
"grad_norm": 1.835046410560608, |
|
"learning_rate": 9.853482563263981e-05, |
|
"loss": 2.9119, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.2492462311557789, |
|
"grad_norm": 1.9712692499160767, |
|
"learning_rate": 9.849461295010156e-05, |
|
"loss": 2.4119, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.25125628140703515, |
|
"grad_norm": 2.0356907844543457, |
|
"learning_rate": 9.84538643114539e-05, |
|
"loss": 2.4488, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.25326633165829143, |
|
"grad_norm": 1.983216643333435, |
|
"learning_rate": 9.841258016704546e-05, |
|
"loss": 2.5618, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.2552763819095477, |
|
"grad_norm": 1.8130682706832886, |
|
"learning_rate": 9.837076097314319e-05, |
|
"loss": 2.6361, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.25728643216080405, |
|
"grad_norm": 1.7342140674591064, |
|
"learning_rate": 9.832840719192736e-05, |
|
"loss": 2.3223, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.2592964824120603, |
|
"grad_norm": 1.9762102365493774, |
|
"learning_rate": 9.82855192914864e-05, |
|
"loss": 2.4629, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.2613065326633166, |
|
"grad_norm": 1.8401470184326172, |
|
"learning_rate": 9.824209774581174e-05, |
|
"loss": 2.5557, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2633165829145729, |
|
"grad_norm": 1.8549468517303467, |
|
"learning_rate": 9.819814303479267e-05, |
|
"loss": 2.6337, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.26532663316582916, |
|
"grad_norm": 2.0843794345855713, |
|
"learning_rate": 9.815365564421085e-05, |
|
"loss": 2.3528, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.26733668341708544, |
|
"grad_norm": 1.893712043762207, |
|
"learning_rate": 9.810863606573513e-05, |
|
"loss": 2.5509, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.2693467336683417, |
|
"grad_norm": 2.0922601222991943, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 2.6126, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.271356783919598, |
|
"grad_norm": 1.8790444135665894, |
|
"learning_rate": 9.801700234117999e-05, |
|
"loss": 2.5569, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2733668341708543, |
|
"grad_norm": 1.8532637357711792, |
|
"learning_rate": 9.797038920782454e-05, |
|
"loss": 2.4948, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.27537688442211056, |
|
"grad_norm": 1.8589162826538086, |
|
"learning_rate": 9.792324591201179e-05, |
|
"loss": 2.4739, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.27738693467336684, |
|
"grad_norm": 1.8177707195281982, |
|
"learning_rate": 9.78755729747633e-05, |
|
"loss": 2.4883, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.2793969849246231, |
|
"grad_norm": 1.9046951532363892, |
|
"learning_rate": 9.782737092295413e-05, |
|
"loss": 2.331, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.2814070351758794, |
|
"grad_norm": 1.8969510793685913, |
|
"learning_rate": 9.777864028930705e-05, |
|
"loss": 2.4818, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2834170854271357, |
|
"grad_norm": 2.1642754077911377, |
|
"learning_rate": 9.77293816123866e-05, |
|
"loss": 2.3212, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.28542713567839195, |
|
"grad_norm": 1.7480413913726807, |
|
"learning_rate": 9.767959543659326e-05, |
|
"loss": 2.5252, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.28743718592964823, |
|
"grad_norm": 2.0396628379821777, |
|
"learning_rate": 9.76292823121573e-05, |
|
"loss": 2.6501, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.2894472361809045, |
|
"grad_norm": 1.5687803030014038, |
|
"learning_rate": 9.757844279513281e-05, |
|
"loss": 2.3029, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.2914572864321608, |
|
"grad_norm": 1.9262704849243164, |
|
"learning_rate": 9.752707744739145e-05, |
|
"loss": 2.4645, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.29346733668341707, |
|
"grad_norm": 1.9408085346221924, |
|
"learning_rate": 9.747518683661631e-05, |
|
"loss": 2.3497, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.29547738693467335, |
|
"grad_norm": 1.78432297706604, |
|
"learning_rate": 9.742277153629564e-05, |
|
"loss": 2.2824, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.2974874371859296, |
|
"grad_norm": 2.070885181427002, |
|
"learning_rate": 9.736983212571646e-05, |
|
"loss": 2.5931, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.2994974874371859, |
|
"grad_norm": 1.657235026359558, |
|
"learning_rate": 9.731636918995821e-05, |
|
"loss": 2.4344, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.3015075376884422, |
|
"grad_norm": 1.8496333360671997, |
|
"learning_rate": 9.726238331988624e-05, |
|
"loss": 2.4043, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3035175879396985, |
|
"grad_norm": 1.8144067525863647, |
|
"learning_rate": 9.720787511214533e-05, |
|
"loss": 2.5708, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.3055276381909548, |
|
"grad_norm": 1.875706434249878, |
|
"learning_rate": 9.715284516915303e-05, |
|
"loss": 2.387, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.3075376884422111, |
|
"grad_norm": 1.905147910118103, |
|
"learning_rate": 9.709729409909307e-05, |
|
"loss": 2.6459, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.30954773869346736, |
|
"grad_norm": 1.919798493385315, |
|
"learning_rate": 9.704122251590862e-05, |
|
"loss": 2.2947, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.31155778894472363, |
|
"grad_norm": 1.5951811075210571, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 2.3532, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3135678391959799, |
|
"grad_norm": 1.6839174032211304, |
|
"learning_rate": 9.692752029469512e-05, |
|
"loss": 2.2152, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.3155778894472362, |
|
"grad_norm": 2.0012948513031006, |
|
"learning_rate": 9.686989091328813e-05, |
|
"loss": 2.3561, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.31758793969849247, |
|
"grad_norm": 1.8996232748031616, |
|
"learning_rate": 9.681174353198687e-05, |
|
"loss": 2.3638, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.31959798994974875, |
|
"grad_norm": 1.8173401355743408, |
|
"learning_rate": 9.675307879342854e-05, |
|
"loss": 2.2213, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.32160804020100503, |
|
"grad_norm": 1.6612550020217896, |
|
"learning_rate": 9.669389734596819e-05, |
|
"loss": 2.5352, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.3236180904522613, |
|
"grad_norm": 1.8817943334579468, |
|
"learning_rate": 9.663419984367139e-05, |
|
"loss": 2.5058, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.3256281407035176, |
|
"grad_norm": 2.1160693168640137, |
|
"learning_rate": 9.657398694630712e-05, |
|
"loss": 2.2973, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.32763819095477387, |
|
"grad_norm": 1.738549828529358, |
|
"learning_rate": 9.651325931934046e-05, |
|
"loss": 2.2865, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.32964824120603015, |
|
"grad_norm": 1.8293092250823975, |
|
"learning_rate": 9.645201763392513e-05, |
|
"loss": 2.1597, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.3316582914572864, |
|
"grad_norm": 1.9268925189971924, |
|
"learning_rate": 9.639026256689628e-05, |
|
"loss": 2.5048, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3336683417085427, |
|
"grad_norm": 1.983359932899475, |
|
"learning_rate": 9.632799480076278e-05, |
|
"loss": 2.3261, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.335678391959799, |
|
"grad_norm": 1.717955470085144, |
|
"learning_rate": 9.626521502369984e-05, |
|
"loss": 2.6444, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.33768844221105526, |
|
"grad_norm": 1.836305856704712, |
|
"learning_rate": 9.620192392954132e-05, |
|
"loss": 2.4493, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.33969849246231154, |
|
"grad_norm": 1.8819546699523926, |
|
"learning_rate": 9.613812221777212e-05, |
|
"loss": 2.6608, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.3417085427135678, |
|
"grad_norm": 1.845680832862854, |
|
"learning_rate": 9.607381059352038e-05, |
|
"loss": 2.2059, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3437185929648241, |
|
"grad_norm": 1.878606915473938, |
|
"learning_rate": 9.600898976754977e-05, |
|
"loss": 2.2733, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.3457286432160804, |
|
"grad_norm": 1.768790364265442, |
|
"learning_rate": 9.594366045625154e-05, |
|
"loss": 2.3773, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.34773869346733666, |
|
"grad_norm": 1.761987328529358, |
|
"learning_rate": 9.587782338163669e-05, |
|
"loss": 2.5148, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.349748743718593, |
|
"grad_norm": 1.6543912887573242, |
|
"learning_rate": 9.581147927132797e-05, |
|
"loss": 2.4609, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.35175879396984927, |
|
"grad_norm": 1.9013664722442627, |
|
"learning_rate": 9.574462885855174e-05, |
|
"loss": 2.121, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.35376884422110555, |
|
"grad_norm": 3.302755832672119, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 2.4353, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.35577889447236183, |
|
"grad_norm": 2.773200750350952, |
|
"learning_rate": 9.560941208647231e-05, |
|
"loss": 2.3111, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.3577889447236181, |
|
"grad_norm": 1.7840498685836792, |
|
"learning_rate": 9.554104722156716e-05, |
|
"loss": 2.5435, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.3597989949748744, |
|
"grad_norm": 1.8287582397460938, |
|
"learning_rate": 9.547217904297411e-05, |
|
"loss": 2.3461, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.36180904522613067, |
|
"grad_norm": 1.8139948844909668, |
|
"learning_rate": 9.540280831181525e-05, |
|
"loss": 2.3423, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.36381909547738694, |
|
"grad_norm": 1.8012893199920654, |
|
"learning_rate": 9.533293579476683e-05, |
|
"loss": 2.412, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.3658291457286432, |
|
"grad_norm": 1.8645695447921753, |
|
"learning_rate": 9.526256226405075e-05, |
|
"loss": 2.5261, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.3678391959798995, |
|
"grad_norm": 1.9607142210006714, |
|
"learning_rate": 9.519168849742604e-05, |
|
"loss": 2.6721, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.3698492462311558, |
|
"grad_norm": 1.772139549255371, |
|
"learning_rate": 9.512031527818028e-05, |
|
"loss": 2.3517, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.37185929648241206, |
|
"grad_norm": 1.8913545608520508, |
|
"learning_rate": 9.504844339512095e-05, |
|
"loss": 2.6095, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.37386934673366834, |
|
"grad_norm": 1.8451741933822632, |
|
"learning_rate": 9.497607364256672e-05, |
|
"loss": 2.5707, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.3758793969849246, |
|
"grad_norm": 1.8256680965423584, |
|
"learning_rate": 9.490320682033855e-05, |
|
"loss": 2.3579, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.3778894472361809, |
|
"grad_norm": 1.9581091403961182, |
|
"learning_rate": 9.482984373375105e-05, |
|
"loss": 2.5629, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.3798994974874372, |
|
"grad_norm": 1.998180627822876, |
|
"learning_rate": 9.475598519360344e-05, |
|
"loss": 2.3817, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.38190954773869346, |
|
"grad_norm": 1.7146025896072388, |
|
"learning_rate": 9.468163201617062e-05, |
|
"loss": 2.2208, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.38391959798994973, |
|
"grad_norm": 1.8821600675582886, |
|
"learning_rate": 9.460678502319418e-05, |
|
"loss": 2.4416, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.385929648241206, |
|
"grad_norm": 1.7585586309432983, |
|
"learning_rate": 9.453144504187327e-05, |
|
"loss": 2.3008, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.3879396984924623, |
|
"grad_norm": 1.6485331058502197, |
|
"learning_rate": 9.445561290485549e-05, |
|
"loss": 2.383, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.38994974874371857, |
|
"grad_norm": 1.9249005317687988, |
|
"learning_rate": 9.437928945022771e-05, |
|
"loss": 2.4204, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.39195979899497485, |
|
"grad_norm": 1.8952553272247314, |
|
"learning_rate": 9.430247552150673e-05, |
|
"loss": 2.2751, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.39396984924623113, |
|
"grad_norm": 2.2544572353363037, |
|
"learning_rate": 9.422517196763002e-05, |
|
"loss": 2.4784, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.39597989949748746, |
|
"grad_norm": 2.308283567428589, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 2.2052, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.39798994974874374, |
|
"grad_norm": 1.8085863590240479, |
|
"learning_rate": 9.40690994072063e-05, |
|
"loss": 2.2588, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.9807807207107544, |
|
"learning_rate": 9.399033212555275e-05, |
|
"loss": 2.2698, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.4020100502512563, |
|
"grad_norm": 1.6799061298370361, |
|
"learning_rate": 9.391107866851143e-05, |
|
"loss": 2.2507, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4040201005025126, |
|
"grad_norm": 1.6821558475494385, |
|
"learning_rate": 9.383133991198112e-05, |
|
"loss": 2.386, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.40603015075376886, |
|
"grad_norm": 1.7488336563110352, |
|
"learning_rate": 9.375111673722414e-05, |
|
"loss": 2.7501, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.40804020100502514, |
|
"grad_norm": 1.7495909929275513, |
|
"learning_rate": 9.367041003085649e-05, |
|
"loss": 2.4578, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.4100502512562814, |
|
"grad_norm": 1.773022174835205, |
|
"learning_rate": 9.358922068483812e-05, |
|
"loss": 2.1896, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.4120603015075377, |
|
"grad_norm": 1.6720446348190308, |
|
"learning_rate": 9.350754959646306e-05, |
|
"loss": 2.2264, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.414070351758794, |
|
"grad_norm": 1.8488444089889526, |
|
"learning_rate": 9.342539766834946e-05, |
|
"loss": 2.5527, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.41608040201005025, |
|
"grad_norm": 1.7335339784622192, |
|
"learning_rate": 9.334276580842967e-05, |
|
"loss": 2.2894, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.41809045226130653, |
|
"grad_norm": 1.670088529586792, |
|
"learning_rate": 9.325965492994018e-05, |
|
"loss": 2.1557, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.4201005025125628, |
|
"grad_norm": 1.6430253982543945, |
|
"learning_rate": 9.317606595141154e-05, |
|
"loss": 2.2405, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.4221105527638191, |
|
"grad_norm": 1.7390164136886597, |
|
"learning_rate": 9.30919997966582e-05, |
|
"loss": 2.3486, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.42412060301507537, |
|
"grad_norm": 1.7880635261535645, |
|
"learning_rate": 9.300745739476829e-05, |
|
"loss": 2.2626, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.42613065326633165, |
|
"grad_norm": 1.7333531379699707, |
|
"learning_rate": 9.292243968009331e-05, |
|
"loss": 2.2424, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.42814070351758793, |
|
"grad_norm": 1.6569937467575073, |
|
"learning_rate": 9.283694759223796e-05, |
|
"loss": 2.1534, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.4301507537688442, |
|
"grad_norm": 1.7097984552383423, |
|
"learning_rate": 9.275098207604957e-05, |
|
"loss": 2.2569, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.4321608040201005, |
|
"grad_norm": 1.7049428224563599, |
|
"learning_rate": 9.266454408160779e-05, |
|
"loss": 2.4531, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.43417085427135677, |
|
"grad_norm": 1.5478094816207886, |
|
"learning_rate": 9.257763456421398e-05, |
|
"loss": 2.2312, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.43618090452261304, |
|
"grad_norm": 1.706602692604065, |
|
"learning_rate": 9.249025448438076e-05, |
|
"loss": 2.4432, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.4381909547738693, |
|
"grad_norm": 1.701011300086975, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 2.1473, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.4402010050251256, |
|
"grad_norm": 1.7655497789382935, |
|
"learning_rate": 9.231408650543874e-05, |
|
"loss": 2.433, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.44221105527638194, |
|
"grad_norm": 1.853971004486084, |
|
"learning_rate": 9.22253005533154e-05, |
|
"loss": 2.3029, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4442211055276382, |
|
"grad_norm": 1.8369559049606323, |
|
"learning_rate": 9.213604793270196e-05, |
|
"loss": 2.1837, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.4462311557788945, |
|
"grad_norm": 1.6373860836029053, |
|
"learning_rate": 9.204632963000671e-05, |
|
"loss": 2.4237, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.4482412060301508, |
|
"grad_norm": 1.7693603038787842, |
|
"learning_rate": 9.195614663678458e-05, |
|
"loss": 2.4229, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.45025125628140705, |
|
"grad_norm": 1.6722673177719116, |
|
"learning_rate": 9.186549994972618e-05, |
|
"loss": 2.3788, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.45226130653266333, |
|
"grad_norm": 1.6546918153762817, |
|
"learning_rate": 9.177439057064683e-05, |
|
"loss": 2.2079, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4542713567839196, |
|
"grad_norm": 1.8333224058151245, |
|
"learning_rate": 9.168281950647545e-05, |
|
"loss": 2.5018, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.4562814070351759, |
|
"grad_norm": 1.7532380819320679, |
|
"learning_rate": 9.159078776924346e-05, |
|
"loss": 2.0344, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.45829145728643217, |
|
"grad_norm": 1.647209644317627, |
|
"learning_rate": 9.149829637607353e-05, |
|
"loss": 2.2324, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.46030150753768845, |
|
"grad_norm": 1.6968568563461304, |
|
"learning_rate": 9.140534634916846e-05, |
|
"loss": 2.1635, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.4623115577889447, |
|
"grad_norm": 1.6668813228607178, |
|
"learning_rate": 9.131193871579975e-05, |
|
"loss": 2.334, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.464321608040201, |
|
"grad_norm": 1.6174511909484863, |
|
"learning_rate": 9.121807450829632e-05, |
|
"loss": 2.1475, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.4663316582914573, |
|
"grad_norm": 1.8468537330627441, |
|
"learning_rate": 9.112375476403312e-05, |
|
"loss": 2.2984, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.46834170854271356, |
|
"grad_norm": 1.7187474966049194, |
|
"learning_rate": 9.102898052541958e-05, |
|
"loss": 2.4139, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.47035175879396984, |
|
"grad_norm": 1.7762575149536133, |
|
"learning_rate": 9.093375283988819e-05, |
|
"loss": 2.1496, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.4723618090452261, |
|
"grad_norm": 1.9211578369140625, |
|
"learning_rate": 9.083807275988284e-05, |
|
"loss": 2.1667, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4743718592964824, |
|
"grad_norm": 3.2748539447784424, |
|
"learning_rate": 9.074194134284726e-05, |
|
"loss": 2.2991, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4763819095477387, |
|
"grad_norm": 1.751917839050293, |
|
"learning_rate": 9.064535965121324e-05, |
|
"loss": 2.2903, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.47839195979899496, |
|
"grad_norm": 1.7564281225204468, |
|
"learning_rate": 9.054832875238903e-05, |
|
"loss": 2.5954, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.48040201005025124, |
|
"grad_norm": 1.6731312274932861, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 2.3209, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4824120603015075, |
|
"grad_norm": 2.067354679107666, |
|
"learning_rate": 9.035292362761381e-05, |
|
"loss": 2.1253, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4844221105527638, |
|
"grad_norm": 1.6500976085662842, |
|
"learning_rate": 9.025455156125466e-05, |
|
"loss": 2.0942, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.4864321608040201, |
|
"grad_norm": 1.7590264081954956, |
|
"learning_rate": 9.015573460686509e-05, |
|
"loss": 2.3827, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.4884422110552764, |
|
"grad_norm": 1.8388062715530396, |
|
"learning_rate": 9.005647385655718e-05, |
|
"loss": 2.2311, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.4904522613065327, |
|
"grad_norm": 1.7108007669448853, |
|
"learning_rate": 8.995677040734769e-05, |
|
"loss": 2.1734, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.49246231155778897, |
|
"grad_norm": 2.4704267978668213, |
|
"learning_rate": 8.985662536114613e-05, |
|
"loss": 2.2604, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.49447236180904525, |
|
"grad_norm": 1.7523339986801147, |
|
"learning_rate": 8.97560398247424e-05, |
|
"loss": 2.3831, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.4964824120603015, |
|
"grad_norm": 1.5778342485427856, |
|
"learning_rate": 8.965501490979467e-05, |
|
"loss": 2.118, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.4984924623115578, |
|
"grad_norm": 1.5618497133255005, |
|
"learning_rate": 8.955355173281708e-05, |
|
"loss": 2.3317, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.5005025125628141, |
|
"grad_norm": 2.049302339553833, |
|
"learning_rate": 8.945165141516734e-05, |
|
"loss": 2.2127, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.5025125628140703, |
|
"grad_norm": 2.8442394733428955, |
|
"learning_rate": 8.934931508303445e-05, |
|
"loss": 2.1562, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5045226130653266, |
|
"grad_norm": 1.7337098121643066, |
|
"learning_rate": 8.924654386742613e-05, |
|
"loss": 2.2541, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5065326633165829, |
|
"grad_norm": 1.6087921857833862, |
|
"learning_rate": 8.914333890415639e-05, |
|
"loss": 2.3858, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.5085427135678392, |
|
"grad_norm": 1.7101848125457764, |
|
"learning_rate": 8.903970133383297e-05, |
|
"loss": 2.0366, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5105527638190954, |
|
"grad_norm": 1.640261173248291, |
|
"learning_rate": 8.89356323018447e-05, |
|
"loss": 2.3109, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5125628140703518, |
|
"grad_norm": 1.7887450456619263, |
|
"learning_rate": 8.883113295834892e-05, |
|
"loss": 2.23, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5145728643216081, |
|
"grad_norm": 1.772544503211975, |
|
"learning_rate": 8.872620445825868e-05, |
|
"loss": 2.3887, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.5165829145728643, |
|
"grad_norm": 1.6293214559555054, |
|
"learning_rate": 8.862084796122998e-05, |
|
"loss": 2.2669, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5185929648241207, |
|
"grad_norm": 1.9295259714126587, |
|
"learning_rate": 8.851506463164907e-05, |
|
"loss": 2.4984, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5206030150753769, |
|
"grad_norm": 1.5557610988616943, |
|
"learning_rate": 8.84088556386194e-05, |
|
"loss": 2.2621, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5226130653266332, |
|
"grad_norm": 1.9299404621124268, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 2.5963, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5246231155778894, |
|
"grad_norm": 1.7403899431228638, |
|
"learning_rate": 8.819516536213683e-05, |
|
"loss": 2.261, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.5266331658291458, |
|
"grad_norm": 1.6017705202102661, |
|
"learning_rate": 8.808768644036085e-05, |
|
"loss": 2.1345, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.528643216080402, |
|
"grad_norm": 1.560952067375183, |
|
"learning_rate": 8.797978657846391e-05, |
|
"loss": 2.1304, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.5306532663316583, |
|
"grad_norm": 1.63511061668396, |
|
"learning_rate": 8.787146696894118e-05, |
|
"loss": 2.2639, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5326633165829145, |
|
"grad_norm": 1.6902211904525757, |
|
"learning_rate": 8.776272880892675e-05, |
|
"loss": 2.4526, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5346733668341709, |
|
"grad_norm": 2.2787492275238037, |
|
"learning_rate": 8.765357330018056e-05, |
|
"loss": 2.1836, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.5366834170854271, |
|
"grad_norm": 1.6377997398376465, |
|
"learning_rate": 8.754400164907497e-05, |
|
"loss": 2.1207, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.5386934673366834, |
|
"grad_norm": 1.829947829246521, |
|
"learning_rate": 8.74340150665815e-05, |
|
"loss": 2.4133, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.5407035175879397, |
|
"grad_norm": 1.8088699579238892, |
|
"learning_rate": 8.732361476825752e-05, |
|
"loss": 2.4477, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.542713567839196, |
|
"grad_norm": 1.8463410139083862, |
|
"learning_rate": 8.721280197423258e-05, |
|
"loss": 2.5268, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5447236180904522, |
|
"grad_norm": 1.7526369094848633, |
|
"learning_rate": 8.710157790919522e-05, |
|
"loss": 2.2297, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.5467336683417086, |
|
"grad_norm": 1.548508882522583, |
|
"learning_rate": 8.69899438023792e-05, |
|
"loss": 2.3638, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.5487437185929648, |
|
"grad_norm": 1.6327900886535645, |
|
"learning_rate": 8.687790088755008e-05, |
|
"loss": 2.5126, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.5507537688442211, |
|
"grad_norm": 4.006368160247803, |
|
"learning_rate": 8.676545040299145e-05, |
|
"loss": 2.4828, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.5527638190954773, |
|
"grad_norm": 1.6274868249893188, |
|
"learning_rate": 8.665259359149132e-05, |
|
"loss": 2.2662, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5547738693467337, |
|
"grad_norm": 1.686354398727417, |
|
"learning_rate": 8.653933170032842e-05, |
|
"loss": 2.4134, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.5567839195979899, |
|
"grad_norm": 1.62911856174469, |
|
"learning_rate": 8.642566598125831e-05, |
|
"loss": 2.275, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.5587939698492462, |
|
"grad_norm": 1.634544849395752, |
|
"learning_rate": 8.631159769049965e-05, |
|
"loss": 2.2874, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.5608040201005026, |
|
"grad_norm": 1.5775830745697021, |
|
"learning_rate": 8.619712808872024e-05, |
|
"loss": 2.2704, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.5628140703517588, |
|
"grad_norm": 1.618634581565857, |
|
"learning_rate": 8.60822584410231e-05, |
|
"loss": 2.545, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5648241206030151, |
|
"grad_norm": 1.692204475402832, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 2.2853, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.5668341708542713, |
|
"grad_norm": 1.717124342918396, |
|
"learning_rate": 8.585132409038013e-05, |
|
"loss": 2.1552, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.5688442211055277, |
|
"grad_norm": 1.6397979259490967, |
|
"learning_rate": 8.573526193969046e-05, |
|
"loss": 2.3056, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.5708542713567839, |
|
"grad_norm": 1.6189571619033813, |
|
"learning_rate": 8.561880484756725e-05, |
|
"loss": 2.1406, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.5728643216080402, |
|
"grad_norm": 1.9604424238204956, |
|
"learning_rate": 8.550195410107902e-05, |
|
"loss": 2.4183, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5748743718592965, |
|
"grad_norm": 1.6223558187484741, |
|
"learning_rate": 8.538471099164493e-05, |
|
"loss": 2.1032, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.5768844221105528, |
|
"grad_norm": 1.8475818634033203, |
|
"learning_rate": 8.526707681502044e-05, |
|
"loss": 2.1361, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.578894472361809, |
|
"grad_norm": 1.6998199224472046, |
|
"learning_rate": 8.51490528712831e-05, |
|
"loss": 2.0701, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.5809045226130654, |
|
"grad_norm": 1.5956889390945435, |
|
"learning_rate": 8.503064046481803e-05, |
|
"loss": 2.3293, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.5829145728643216, |
|
"grad_norm": 1.544945478439331, |
|
"learning_rate": 8.491184090430364e-05, |
|
"loss": 2.0934, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5849246231155779, |
|
"grad_norm": 1.5254535675048828, |
|
"learning_rate": 8.479265550269714e-05, |
|
"loss": 1.9323, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.5869346733668341, |
|
"grad_norm": 1.6815849542617798, |
|
"learning_rate": 8.467308557721996e-05, |
|
"loss": 2.2834, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.5889447236180905, |
|
"grad_norm": 1.7338416576385498, |
|
"learning_rate": 8.455313244934324e-05, |
|
"loss": 2.2117, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.5909547738693467, |
|
"grad_norm": 1.6180495023727417, |
|
"learning_rate": 8.443279744477324e-05, |
|
"loss": 2.1526, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.592964824120603, |
|
"grad_norm": 1.6239418983459473, |
|
"learning_rate": 8.43120818934367e-05, |
|
"loss": 2.1047, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5949748743718593, |
|
"grad_norm": 1.6448568105697632, |
|
"learning_rate": 8.419098712946601e-05, |
|
"loss": 2.0582, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.5969849246231156, |
|
"grad_norm": 1.722649097442627, |
|
"learning_rate": 8.406951449118469e-05, |
|
"loss": 1.9733, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.5989949748743718, |
|
"grad_norm": 1.7211109399795532, |
|
"learning_rate": 8.394766532109242e-05, |
|
"loss": 2.2033, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6010050251256281, |
|
"grad_norm": 1.55821692943573, |
|
"learning_rate": 8.382544096585027e-05, |
|
"loss": 1.8462, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6030150753768844, |
|
"grad_norm": 1.5059350728988647, |
|
"learning_rate": 8.370284277626577e-05, |
|
"loss": 2.0388, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6050251256281407, |
|
"grad_norm": 1.7469459772109985, |
|
"learning_rate": 8.357987210727808e-05, |
|
"loss": 2.367, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.607035175879397, |
|
"grad_norm": 1.6946009397506714, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 2.3654, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6090452261306533, |
|
"grad_norm": 2.0396924018859863, |
|
"learning_rate": 8.333281877141758e-05, |
|
"loss": 2.2726, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6110552763819096, |
|
"grad_norm": 1.6228737831115723, |
|
"learning_rate": 8.320873883494585e-05, |
|
"loss": 2.2077, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6130653266331658, |
|
"grad_norm": 1.6517311334609985, |
|
"learning_rate": 8.308429187984297e-05, |
|
"loss": 2.2134, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6150753768844222, |
|
"grad_norm": 1.6150524616241455, |
|
"learning_rate": 8.295947928148036e-05, |
|
"loss": 2.1454, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6170854271356784, |
|
"grad_norm": 1.5641722679138184, |
|
"learning_rate": 8.283430241927052e-05, |
|
"loss": 1.9578, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.6190954773869347, |
|
"grad_norm": 1.668264389038086, |
|
"learning_rate": 8.270876267665173e-05, |
|
"loss": 2.1463, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.6211055276381909, |
|
"grad_norm": 1.6941081285476685, |
|
"learning_rate": 8.258286144107276e-05, |
|
"loss": 2.2608, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.6231155778894473, |
|
"grad_norm": 1.9184767007827759, |
|
"learning_rate": 8.24566001039776e-05, |
|
"loss": 2.2404, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6251256281407035, |
|
"grad_norm": 1.5730127096176147, |
|
"learning_rate": 8.232998006078997e-05, |
|
"loss": 2.1935, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.6271356783919598, |
|
"grad_norm": 1.4828630685806274, |
|
"learning_rate": 8.220300271089807e-05, |
|
"loss": 2.2226, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.629145728643216, |
|
"grad_norm": 1.5179678201675415, |
|
"learning_rate": 8.207566945763885e-05, |
|
"loss": 2.0261, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.6311557788944724, |
|
"grad_norm": 1.4846012592315674, |
|
"learning_rate": 8.19479817082828e-05, |
|
"loss": 2.1515, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.6331658291457286, |
|
"grad_norm": 1.6346242427825928, |
|
"learning_rate": 8.181994087401819e-05, |
|
"loss": 2.0147, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6351758793969849, |
|
"grad_norm": 1.5853772163391113, |
|
"learning_rate": 8.169154836993551e-05, |
|
"loss": 2.1816, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.6371859296482412, |
|
"grad_norm": 1.573282241821289, |
|
"learning_rate": 8.156280561501195e-05, |
|
"loss": 2.0802, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.6391959798994975, |
|
"grad_norm": 1.5579333305358887, |
|
"learning_rate": 8.143371403209554e-05, |
|
"loss": 2.1474, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6412060301507537, |
|
"grad_norm": 1.6341716051101685, |
|
"learning_rate": 8.130427504788955e-05, |
|
"loss": 2.293, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.6432160804020101, |
|
"grad_norm": 1.6196293830871582, |
|
"learning_rate": 8.117449009293668e-05, |
|
"loss": 2.0744, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6452261306532663, |
|
"grad_norm": 1.702324628829956, |
|
"learning_rate": 8.104436060160324e-05, |
|
"loss": 2.158, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.6472361809045226, |
|
"grad_norm": 1.734678030014038, |
|
"learning_rate": 8.091388801206333e-05, |
|
"loss": 2.2079, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.6492462311557788, |
|
"grad_norm": 1.667091727256775, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 2.1447, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.6512562814070352, |
|
"grad_norm": 1.7557045221328735, |
|
"learning_rate": 8.06519193100039e-05, |
|
"loss": 2.2181, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.6532663316582915, |
|
"grad_norm": 1.6288976669311523, |
|
"learning_rate": 8.052042609272817e-05, |
|
"loss": 2.0902, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6552763819095477, |
|
"grad_norm": 1.6819061040878296, |
|
"learning_rate": 8.038859556770151e-05, |
|
"loss": 2.3705, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.6572864321608041, |
|
"grad_norm": 1.7110100984573364, |
|
"learning_rate": 8.025642919189762e-05, |
|
"loss": 2.1783, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.6592964824120603, |
|
"grad_norm": 1.7823935747146606, |
|
"learning_rate": 8.012392842600198e-05, |
|
"loss": 2.3344, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.6613065326633166, |
|
"grad_norm": 1.661436915397644, |
|
"learning_rate": 7.99910947343957e-05, |
|
"loss": 2.2976, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.6633165829145728, |
|
"grad_norm": 1.6022391319274902, |
|
"learning_rate": 7.985792958513931e-05, |
|
"loss": 1.9313, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6653266331658292, |
|
"grad_norm": 1.5373127460479736, |
|
"learning_rate": 7.972443444995663e-05, |
|
"loss": 2.182, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.6673366834170854, |
|
"grad_norm": 1.713374376296997, |
|
"learning_rate": 7.959061080421839e-05, |
|
"loss": 2.3038, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.6693467336683417, |
|
"grad_norm": 1.645225167274475, |
|
"learning_rate": 7.9456460126926e-05, |
|
"loss": 2.2538, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.671356783919598, |
|
"grad_norm": 1.480999231338501, |
|
"learning_rate": 7.932198390069515e-05, |
|
"loss": 1.9686, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.6733668341708543, |
|
"grad_norm": 3.5348572731018066, |
|
"learning_rate": 7.91871836117395e-05, |
|
"loss": 2.3065, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6753768844221105, |
|
"grad_norm": 1.6705471277236938, |
|
"learning_rate": 7.905206074985416e-05, |
|
"loss": 2.1366, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.6773869346733669, |
|
"grad_norm": 1.6539703607559204, |
|
"learning_rate": 7.891661680839932e-05, |
|
"loss": 2.111, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.6793969849246231, |
|
"grad_norm": 1.5140559673309326, |
|
"learning_rate": 7.878085328428369e-05, |
|
"loss": 2.1999, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.6814070351758794, |
|
"grad_norm": 1.6765365600585938, |
|
"learning_rate": 7.86447716779479e-05, |
|
"loss": 1.9244, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.6834170854271356, |
|
"grad_norm": 1.6503502130508423, |
|
"learning_rate": 7.85083734933481e-05, |
|
"loss": 2.0321, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.685427135678392, |
|
"grad_norm": 1.5550287961959839, |
|
"learning_rate": 7.83716602379391e-05, |
|
"loss": 1.9953, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.6874371859296482, |
|
"grad_norm": 1.5494416952133179, |
|
"learning_rate": 7.823463342265792e-05, |
|
"loss": 2.1868, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.6894472361809045, |
|
"grad_norm": 1.5480455160140991, |
|
"learning_rate": 7.809729456190698e-05, |
|
"loss": 2.2091, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.6914572864321608, |
|
"grad_norm": 1.5905412435531616, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 2.0997, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.6934673366834171, |
|
"grad_norm": 1.6843619346618652, |
|
"learning_rate": 7.782168677883206e-05, |
|
"loss": 2.2762, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6954773869346733, |
|
"grad_norm": 1.9944095611572266, |
|
"learning_rate": 7.76834209024892e-05, |
|
"loss": 1.8889, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.6974874371859296, |
|
"grad_norm": 1.6074081659317017, |
|
"learning_rate": 7.754484907260513e-05, |
|
"loss": 2.2474, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.699497487437186, |
|
"grad_norm": 1.506535530090332, |
|
"learning_rate": 7.740597282065756e-05, |
|
"loss": 2.1043, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.7015075376884422, |
|
"grad_norm": 1.5604645013809204, |
|
"learning_rate": 7.726679368148864e-05, |
|
"loss": 2.0762, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.7035175879396985, |
|
"grad_norm": 1.6519221067428589, |
|
"learning_rate": 7.712731319328798e-05, |
|
"loss": 2.123, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7055276381909548, |
|
"grad_norm": 1.775421142578125, |
|
"learning_rate": 7.698753289757565e-05, |
|
"loss": 2.125, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.7075376884422111, |
|
"grad_norm": 1.552018404006958, |
|
"learning_rate": 7.684745433918518e-05, |
|
"loss": 1.952, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.7095477386934673, |
|
"grad_norm": 1.8088091611862183, |
|
"learning_rate": 7.670707906624644e-05, |
|
"loss": 2.2094, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.7115577889447237, |
|
"grad_norm": 1.5663000345230103, |
|
"learning_rate": 7.656640863016857e-05, |
|
"loss": 2.2196, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.7135678391959799, |
|
"grad_norm": 1.6653417348861694, |
|
"learning_rate": 7.642544458562278e-05, |
|
"loss": 2.1193, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7155778894472362, |
|
"grad_norm": 1.5582131147384644, |
|
"learning_rate": 7.628418849052523e-05, |
|
"loss": 2.0609, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.7175879396984924, |
|
"grad_norm": 1.6001904010772705, |
|
"learning_rate": 7.614264190601981e-05, |
|
"loss": 1.9337, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.7195979899497488, |
|
"grad_norm": 1.5969533920288086, |
|
"learning_rate": 7.600080639646077e-05, |
|
"loss": 1.9728, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.721608040201005, |
|
"grad_norm": 1.5485557317733765, |
|
"learning_rate": 7.585868352939563e-05, |
|
"loss": 2.0791, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.7236180904522613, |
|
"grad_norm": 1.6326680183410645, |
|
"learning_rate": 7.571627487554769e-05, |
|
"loss": 2.1784, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7256281407035176, |
|
"grad_norm": 1.7753201723098755, |
|
"learning_rate": 7.55735820087987e-05, |
|
"loss": 2.1085, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.7276381909547739, |
|
"grad_norm": 1.6765743494033813, |
|
"learning_rate": 7.543060650617158e-05, |
|
"loss": 2.2751, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.7296482412060301, |
|
"grad_norm": 1.7378075122833252, |
|
"learning_rate": 7.528734994781283e-05, |
|
"loss": 2.5291, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.7316582914572864, |
|
"grad_norm": 1.5884289741516113, |
|
"learning_rate": 7.514381391697517e-05, |
|
"loss": 2.1576, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.7336683417085427, |
|
"grad_norm": 1.4826475381851196, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 2.0913, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.735678391959799, |
|
"grad_norm": 1.5756173133850098, |
|
"learning_rate": 7.48559097862999e-05, |
|
"loss": 1.977, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.7376884422110552, |
|
"grad_norm": 1.5923422574996948, |
|
"learning_rate": 7.471154486834105e-05, |
|
"loss": 2.1248, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.7396984924623116, |
|
"grad_norm": 1.8260492086410522, |
|
"learning_rate": 7.456690684162557e-05, |
|
"loss": 2.1871, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.7417085427135678, |
|
"grad_norm": 1.6289384365081787, |
|
"learning_rate": 7.442199730467402e-05, |
|
"loss": 1.9642, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.7437185929648241, |
|
"grad_norm": 1.423568606376648, |
|
"learning_rate": 7.427681785900761e-05, |
|
"loss": 1.7796, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7457286432160805, |
|
"grad_norm": 1.7022663354873657, |
|
"learning_rate": 7.413137010913054e-05, |
|
"loss": 2.2234, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.7477386934673367, |
|
"grad_norm": 1.6905540227890015, |
|
"learning_rate": 7.398565566251232e-05, |
|
"loss": 2.1867, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.749748743718593, |
|
"grad_norm": 1.545426607131958, |
|
"learning_rate": 7.383967612956988e-05, |
|
"loss": 2.0914, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.7517587939698492, |
|
"grad_norm": 1.6682387590408325, |
|
"learning_rate": 7.369343312364993e-05, |
|
"loss": 2.2405, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.7537688442211056, |
|
"grad_norm": 1.634537935256958, |
|
"learning_rate": 7.354692826101102e-05, |
|
"loss": 1.9596, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7557788944723618, |
|
"grad_norm": 1.6119855642318726, |
|
"learning_rate": 7.340016316080565e-05, |
|
"loss": 2.1663, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.7577889447236181, |
|
"grad_norm": 1.621778130531311, |
|
"learning_rate": 7.325313944506254e-05, |
|
"loss": 1.9821, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.7597989949748744, |
|
"grad_norm": 2.1226937770843506, |
|
"learning_rate": 7.310585873866848e-05, |
|
"loss": 2.2974, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.7618090452261307, |
|
"grad_norm": 1.5477526187896729, |
|
"learning_rate": 7.295832266935059e-05, |
|
"loss": 1.8883, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.7638190954773869, |
|
"grad_norm": 1.5489342212677002, |
|
"learning_rate": 7.281053286765815e-05, |
|
"loss": 2.0525, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7658291457286432, |
|
"grad_norm": 2.4198858737945557, |
|
"learning_rate": 7.26624909669447e-05, |
|
"loss": 2.0754, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.7678391959798995, |
|
"grad_norm": 1.5412720441818237, |
|
"learning_rate": 7.251419860334994e-05, |
|
"loss": 1.8809, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.7698492462311558, |
|
"grad_norm": 1.656874179840088, |
|
"learning_rate": 7.236565741578163e-05, |
|
"loss": 2.0115, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.771859296482412, |
|
"grad_norm": 1.5036085844039917, |
|
"learning_rate": 7.221686904589754e-05, |
|
"loss": 2.2604, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.7738693467336684, |
|
"grad_norm": 1.4638508558273315, |
|
"learning_rate": 7.20678351380872e-05, |
|
"loss": 1.9812, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7758793969849246, |
|
"grad_norm": 1.5553574562072754, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 2.0646, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.7778894472361809, |
|
"grad_norm": 1.5192738771438599, |
|
"learning_rate": 7.176903729979621e-05, |
|
"loss": 2.3253, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.7798994974874371, |
|
"grad_norm": 1.702390432357788, |
|
"learning_rate": 7.161927667159013e-05, |
|
"loss": 2.0733, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.7819095477386935, |
|
"grad_norm": 1.551355004310608, |
|
"learning_rate": 7.146927710997047e-05, |
|
"loss": 2.0301, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.7839195979899497, |
|
"grad_norm": 1.6954432725906372, |
|
"learning_rate": 7.13190402727127e-05, |
|
"loss": 1.9302, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.785929648241206, |
|
"grad_norm": 1.5084172487258911, |
|
"learning_rate": 7.116856782021468e-05, |
|
"loss": 2.4387, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.7879396984924623, |
|
"grad_norm": 1.5950528383255005, |
|
"learning_rate": 7.101786141547828e-05, |
|
"loss": 2.1753, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.7899497487437186, |
|
"grad_norm": 1.674777865409851, |
|
"learning_rate": 7.08669227240909e-05, |
|
"loss": 2.0804, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.7919597989949749, |
|
"grad_norm": 1.5074516534805298, |
|
"learning_rate": 7.071575341420719e-05, |
|
"loss": 2.2549, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.7939698492462312, |
|
"grad_norm": 1.6578811407089233, |
|
"learning_rate": 7.056435515653059e-05, |
|
"loss": 2.1851, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7959798994974875, |
|
"grad_norm": 1.6693928241729736, |
|
"learning_rate": 7.041272962429477e-05, |
|
"loss": 2.2608, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.7979899497487437, |
|
"grad_norm": 1.6492758989334106, |
|
"learning_rate": 7.026087849324527e-05, |
|
"loss": 1.9814, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.59951651096344, |
|
"learning_rate": 7.010880344162088e-05, |
|
"loss": 2.0959, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.8020100502512563, |
|
"grad_norm": 1.6797847747802734, |
|
"learning_rate": 6.995650615013516e-05, |
|
"loss": 2.242, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.8040201005025126, |
|
"grad_norm": 1.566300868988037, |
|
"learning_rate": 6.980398830195785e-05, |
|
"loss": 2.2452, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8060301507537688, |
|
"grad_norm": 1.471531629562378, |
|
"learning_rate": 6.965125158269619e-05, |
|
"loss": 1.7636, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.8080402010050252, |
|
"grad_norm": 1.5839449167251587, |
|
"learning_rate": 6.94982976803764e-05, |
|
"loss": 2.299, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.8100502512562814, |
|
"grad_norm": 1.5502268075942993, |
|
"learning_rate": 6.934512828542497e-05, |
|
"loss": 1.8683, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.8120603015075377, |
|
"grad_norm": 1.532381296157837, |
|
"learning_rate": 6.919174509065004e-05, |
|
"loss": 1.8511, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.8140703517587939, |
|
"grad_norm": 1.4679884910583496, |
|
"learning_rate": 6.903814979122249e-05, |
|
"loss": 2.1163, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8160804020100503, |
|
"grad_norm": 1.5164811611175537, |
|
"learning_rate": 6.888434408465751e-05, |
|
"loss": 2.1253, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.8180904522613065, |
|
"grad_norm": 1.6110063791275024, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 2.2175, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.8201005025125628, |
|
"grad_norm": 1.4853484630584717, |
|
"learning_rate": 6.85761082517839e-05, |
|
"loss": 2.1247, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.8221105527638191, |
|
"grad_norm": 1.7195366621017456, |
|
"learning_rate": 6.842168153205734e-05, |
|
"loss": 2.2131, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.8241206030150754, |
|
"grad_norm": 1.622910976409912, |
|
"learning_rate": 6.826705121831976e-05, |
|
"loss": 2.0826, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8261306532663316, |
|
"grad_norm": 1.5393376350402832, |
|
"learning_rate": 6.811221901952513e-05, |
|
"loss": 1.9764, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.828140703517588, |
|
"grad_norm": 1.475002408027649, |
|
"learning_rate": 6.795718664685868e-05, |
|
"loss": 1.9606, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.8301507537688442, |
|
"grad_norm": 1.607079267501831, |
|
"learning_rate": 6.780195581371784e-05, |
|
"loss": 2.3341, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.8321608040201005, |
|
"grad_norm": 1.5544415712356567, |
|
"learning_rate": 6.764652823569344e-05, |
|
"loss": 2.3198, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.8341708542713567, |
|
"grad_norm": 1.5592552423477173, |
|
"learning_rate": 6.749090563055076e-05, |
|
"loss": 1.8339, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8361809045226131, |
|
"grad_norm": 1.6058601140975952, |
|
"learning_rate": 6.733508971821036e-05, |
|
"loss": 2.0026, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.8381909547738694, |
|
"grad_norm": 1.5740671157836914, |
|
"learning_rate": 6.717908222072935e-05, |
|
"loss": 2.1512, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.8402010050251256, |
|
"grad_norm": 1.4987316131591797, |
|
"learning_rate": 6.702288486228216e-05, |
|
"loss": 2.1372, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.842211055276382, |
|
"grad_norm": 1.501441478729248, |
|
"learning_rate": 6.686649936914152e-05, |
|
"loss": 2.1216, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.8442211055276382, |
|
"grad_norm": 1.522459626197815, |
|
"learning_rate": 6.670992746965938e-05, |
|
"loss": 2.2921, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8462311557788945, |
|
"grad_norm": 1.5145554542541504, |
|
"learning_rate": 6.65531708942479e-05, |
|
"loss": 2.0768, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.8482412060301507, |
|
"grad_norm": 1.7844620943069458, |
|
"learning_rate": 6.639623137536023e-05, |
|
"loss": 2.0404, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.8502512562814071, |
|
"grad_norm": 1.5324727296829224, |
|
"learning_rate": 6.623911064747133e-05, |
|
"loss": 1.9641, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.8522613065326633, |
|
"grad_norm": 1.5896046161651611, |
|
"learning_rate": 6.608181044705892e-05, |
|
"loss": 1.922, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.8542713567839196, |
|
"grad_norm": 1.531399130821228, |
|
"learning_rate": 6.592433251258423e-05, |
|
"loss": 1.9917, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8562814070351759, |
|
"grad_norm": 1.3786967992782593, |
|
"learning_rate": 6.576667858447272e-05, |
|
"loss": 2.0952, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.8582914572864322, |
|
"grad_norm": 1.5708434581756592, |
|
"learning_rate": 6.560885040509499e-05, |
|
"loss": 2.0783, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.8603015075376884, |
|
"grad_norm": 1.5222376585006714, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 1.9323, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.8623115577889447, |
|
"grad_norm": 1.591243028640747, |
|
"learning_rate": 6.529267827163277e-05, |
|
"loss": 2.0764, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.864321608040201, |
|
"grad_norm": 1.5842283964157104, |
|
"learning_rate": 6.51343378118413e-05, |
|
"loss": 2.183, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8663316582914573, |
|
"grad_norm": 1.5385973453521729, |
|
"learning_rate": 6.497583008933097e-05, |
|
"loss": 2.2194, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.8683417085427135, |
|
"grad_norm": 1.5147958993911743, |
|
"learning_rate": 6.481715685590836e-05, |
|
"loss": 1.9953, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.8703517587939699, |
|
"grad_norm": 1.5883772373199463, |
|
"learning_rate": 6.465831986520927e-05, |
|
"loss": 2.0174, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.8723618090452261, |
|
"grad_norm": 1.5510574579238892, |
|
"learning_rate": 6.449932087267932e-05, |
|
"loss": 1.9516, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.8743718592964824, |
|
"grad_norm": 1.4354772567749023, |
|
"learning_rate": 6.434016163555452e-05, |
|
"loss": 1.9101, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8763819095477386, |
|
"grad_norm": 1.5250461101531982, |
|
"learning_rate": 6.418084391284192e-05, |
|
"loss": 2.1036, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.878391959798995, |
|
"grad_norm": 1.6079366207122803, |
|
"learning_rate": 6.402136946530014e-05, |
|
"loss": 2.1401, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.8804020100502512, |
|
"grad_norm": 1.5236170291900635, |
|
"learning_rate": 6.386174005541986e-05, |
|
"loss": 1.9832, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.8824120603015075, |
|
"grad_norm": 1.5333629846572876, |
|
"learning_rate": 6.370195744740442e-05, |
|
"loss": 1.8619, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.8844221105527639, |
|
"grad_norm": 1.5753098726272583, |
|
"learning_rate": 6.354202340715026e-05, |
|
"loss": 1.8309, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8864321608040201, |
|
"grad_norm": 1.5884308815002441, |
|
"learning_rate": 6.338193970222744e-05, |
|
"loss": 2.1437, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.8884422110552764, |
|
"grad_norm": 1.5323386192321777, |
|
"learning_rate": 6.322170810186012e-05, |
|
"loss": 2.1074, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.8904522613065327, |
|
"grad_norm": 1.4975218772888184, |
|
"learning_rate": 6.306133037690693e-05, |
|
"loss": 2.0906, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.892462311557789, |
|
"grad_norm": 1.5518718957901, |
|
"learning_rate": 6.290080829984147e-05, |
|
"loss": 1.9955, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.8944723618090452, |
|
"grad_norm": 1.5773143768310547, |
|
"learning_rate": 6.274014364473274e-05, |
|
"loss": 2.2483, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8964824120603015, |
|
"grad_norm": 1.533818006515503, |
|
"learning_rate": 6.257933818722543e-05, |
|
"loss": 2.093, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.8984924623115578, |
|
"grad_norm": 1.442911148071289, |
|
"learning_rate": 6.241839370452041e-05, |
|
"loss": 2.1291, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.9005025125628141, |
|
"grad_norm": 1.498679518699646, |
|
"learning_rate": 6.2257311975355e-05, |
|
"loss": 1.9726, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.9025125628140703, |
|
"grad_norm": 1.647017002105713, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 1.9182, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.9045226130653267, |
|
"grad_norm": 1.546736240386963, |
|
"learning_rate": 6.19347439001569e-05, |
|
"loss": 2.0869, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9065326633165829, |
|
"grad_norm": 1.4182671308517456, |
|
"learning_rate": 6.177326111910429e-05, |
|
"loss": 2.1178, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.9085427135678392, |
|
"grad_norm": 1.685409426689148, |
|
"learning_rate": 6.161164822151213e-05, |
|
"loss": 2.1423, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.9105527638190954, |
|
"grad_norm": 1.488713026046753, |
|
"learning_rate": 6.144990699350497e-05, |
|
"loss": 2.2579, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.9125628140703518, |
|
"grad_norm": 1.6088262796401978, |
|
"learning_rate": 6.128803922262573e-05, |
|
"loss": 2.1192, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.914572864321608, |
|
"grad_norm": 1.6858761310577393, |
|
"learning_rate": 6.112604669781572e-05, |
|
"loss": 1.9913, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9165829145728643, |
|
"grad_norm": 1.4918874502182007, |
|
"learning_rate": 6.096393120939516e-05, |
|
"loss": 2.0041, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.9185929648241206, |
|
"grad_norm": 1.501802921295166, |
|
"learning_rate": 6.08016945490432e-05, |
|
"loss": 2.1627, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.9206030150753769, |
|
"grad_norm": 1.6295005083084106, |
|
"learning_rate": 6.063933850977811e-05, |
|
"loss": 2.281, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.9226130653266331, |
|
"grad_norm": 1.4057841300964355, |
|
"learning_rate": 6.04768648859376e-05, |
|
"loss": 2.0616, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.9246231155778895, |
|
"grad_norm": 1.386192798614502, |
|
"learning_rate": 6.031427547315889e-05, |
|
"loss": 1.9191, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9266331658291457, |
|
"grad_norm": 1.4363282918930054, |
|
"learning_rate": 6.015157206835881e-05, |
|
"loss": 1.94, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.928643216080402, |
|
"grad_norm": 1.6516571044921875, |
|
"learning_rate": 5.9988756469714135e-05, |
|
"loss": 2.3657, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.9306532663316583, |
|
"grad_norm": 1.465562105178833, |
|
"learning_rate": 5.982583047664151e-05, |
|
"loss": 2.1271, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.9326633165829146, |
|
"grad_norm": 1.5715078115463257, |
|
"learning_rate": 5.9662795889777666e-05, |
|
"loss": 2.1258, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.9346733668341709, |
|
"grad_norm": 3.0183351039886475, |
|
"learning_rate": 5.949965451095951e-05, |
|
"loss": 2.2448, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9366834170854271, |
|
"grad_norm": 1.5689525604248047, |
|
"learning_rate": 5.933640814320417e-05, |
|
"loss": 2.161, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.9386934673366835, |
|
"grad_norm": 1.5857359170913696, |
|
"learning_rate": 5.917305859068912e-05, |
|
"loss": 2.2529, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.9407035175879397, |
|
"grad_norm": 1.4421300888061523, |
|
"learning_rate": 5.900960765873222e-05, |
|
"loss": 2.1033, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.942713567839196, |
|
"grad_norm": 1.5819547176361084, |
|
"learning_rate": 5.8846057153771786e-05, |
|
"loss": 2.2916, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.9447236180904522, |
|
"grad_norm": 1.4737604856491089, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 2.005, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9467336683417086, |
|
"grad_norm": 1.4487162828445435, |
|
"learning_rate": 5.8518664656075706e-05, |
|
"loss": 1.9979, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.9487437185929648, |
|
"grad_norm": 1.5933785438537598, |
|
"learning_rate": 5.835482628163909e-05, |
|
"loss": 1.7228, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.9507537688442211, |
|
"grad_norm": 1.6304993629455566, |
|
"learning_rate": 5.819089557075689e-05, |
|
"loss": 2.109, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.9527638190954774, |
|
"grad_norm": 1.5096608400344849, |
|
"learning_rate": 5.802687433516989e-05, |
|
"loss": 1.9898, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.9547738693467337, |
|
"grad_norm": 1.611997127532959, |
|
"learning_rate": 5.786276438761927e-05, |
|
"loss": 1.9248, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9567839195979899, |
|
"grad_norm": 1.6030073165893555, |
|
"learning_rate": 5.7698567541826675e-05, |
|
"loss": 2.0681, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.9587939698492463, |
|
"grad_norm": 1.569161057472229, |
|
"learning_rate": 5.753428561247416e-05, |
|
"loss": 1.919, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.9608040201005025, |
|
"grad_norm": 1.5151729583740234, |
|
"learning_rate": 5.7369920415184064e-05, |
|
"loss": 2.1896, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.9628140703517588, |
|
"grad_norm": 1.6666862964630127, |
|
"learning_rate": 5.7205473766499005e-05, |
|
"loss": 2.0416, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.964824120603015, |
|
"grad_norm": 1.5797557830810547, |
|
"learning_rate": 5.704094748386184e-05, |
|
"loss": 1.9272, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9668341708542714, |
|
"grad_norm": 1.508834719657898, |
|
"learning_rate": 5.6876343385595446e-05, |
|
"loss": 1.7583, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.9688442211055276, |
|
"grad_norm": 1.4971197843551636, |
|
"learning_rate": 5.6711663290882776e-05, |
|
"loss": 1.9919, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.9708542713567839, |
|
"grad_norm": 1.6074587106704712, |
|
"learning_rate": 5.6546909019746666e-05, |
|
"loss": 1.982, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.9728643216080402, |
|
"grad_norm": 3.3276638984680176, |
|
"learning_rate": 5.6382082393029746e-05, |
|
"loss": 2.0718, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.9748743718592965, |
|
"grad_norm": 1.4836559295654297, |
|
"learning_rate": 5.621718523237427e-05, |
|
"loss": 2.196, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9768844221105528, |
|
"grad_norm": 1.502540946006775, |
|
"learning_rate": 5.605221936020207e-05, |
|
"loss": 2.1674, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.978894472361809, |
|
"grad_norm": 1.4632574319839478, |
|
"learning_rate": 5.588718659969438e-05, |
|
"loss": 2.2086, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.9809045226130654, |
|
"grad_norm": 1.5661799907684326, |
|
"learning_rate": 5.57220887747716e-05, |
|
"loss": 2.0454, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.9829145728643216, |
|
"grad_norm": 1.5829559564590454, |
|
"learning_rate": 5.5556927710073314e-05, |
|
"loss": 1.9525, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.9849246231155779, |
|
"grad_norm": 1.6690107583999634, |
|
"learning_rate": 5.539170523093794e-05, |
|
"loss": 2.0355, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9869346733668342, |
|
"grad_norm": 1.524163842201233, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 2.0668, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.9889447236180905, |
|
"grad_norm": 1.5150009393692017, |
|
"learning_rate": 5.506108333408329e-05, |
|
"loss": 2.1167, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.9909547738693467, |
|
"grad_norm": 1.5068144798278809, |
|
"learning_rate": 5.489568757035391e-05, |
|
"loss": 2.0714, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.992964824120603, |
|
"grad_norm": 1.5889198780059814, |
|
"learning_rate": 5.473023770012686e-05, |
|
"loss": 2.0012, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.9949748743718593, |
|
"grad_norm": 1.4362760782241821, |
|
"learning_rate": 5.456473555193242e-05, |
|
"loss": 1.9559, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9969849246231156, |
|
"grad_norm": 1.4204431772232056, |
|
"learning_rate": 5.4399182954878656e-05, |
|
"loss": 1.8115, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.9989949748743718, |
|
"grad_norm": 1.500132441520691, |
|
"learning_rate": 5.4233581738631165e-05, |
|
"loss": 2.2726, |
|
"step": 994 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 995, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6840475756671795e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|