|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 382, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002617801047120419, |
|
"grad_norm": 4.880457401275635, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 6.8614, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005235602094240838, |
|
"grad_norm": 5.4761762619018555, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 7.3812, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007853403141361256, |
|
"grad_norm": 4.928334712982178, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 7.2286, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010471204188481676, |
|
"grad_norm": 5.0216498374938965, |
|
"learning_rate": 1.0256410256410256e-05, |
|
"loss": 7.3511, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013089005235602094, |
|
"grad_norm": 4.557470798492432, |
|
"learning_rate": 1.282051282051282e-05, |
|
"loss": 6.6464, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.015706806282722512, |
|
"grad_norm": 4.309239864349365, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 6.9958, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01832460732984293, |
|
"grad_norm": 4.001352787017822, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 6.5667, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.020942408376963352, |
|
"grad_norm": 3.922077178955078, |
|
"learning_rate": 2.0512820512820512e-05, |
|
"loss": 6.491, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02356020942408377, |
|
"grad_norm": 3.411221742630005, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 6.0273, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.02617801047120419, |
|
"grad_norm": 3.9540348052978516, |
|
"learning_rate": 2.564102564102564e-05, |
|
"loss": 6.6163, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.028795811518324606, |
|
"grad_norm": 3.2421910762786865, |
|
"learning_rate": 2.8205128205128207e-05, |
|
"loss": 6.2189, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.031413612565445025, |
|
"grad_norm": 3.0702974796295166, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 5.5681, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.034031413612565446, |
|
"grad_norm": 4.09296178817749, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 6.302, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03664921465968586, |
|
"grad_norm": 2.9595351219177246, |
|
"learning_rate": 3.58974358974359e-05, |
|
"loss": 5.4689, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03926701570680628, |
|
"grad_norm": 3.2727208137512207, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 5.6809, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.041884816753926704, |
|
"grad_norm": 3.616870880126953, |
|
"learning_rate": 4.1025641025641023e-05, |
|
"loss": 5.7648, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04450261780104712, |
|
"grad_norm": 3.6780197620391846, |
|
"learning_rate": 4.358974358974359e-05, |
|
"loss": 5.4097, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04712041884816754, |
|
"grad_norm": 3.512361526489258, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 5.7418, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.049738219895287955, |
|
"grad_norm": 4.385097503662109, |
|
"learning_rate": 4.871794871794872e-05, |
|
"loss": 5.4156, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05235602094240838, |
|
"grad_norm": 4.452427864074707, |
|
"learning_rate": 5.128205128205128e-05, |
|
"loss": 5.3937, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0549738219895288, |
|
"grad_norm": 3.246995210647583, |
|
"learning_rate": 5.384615384615385e-05, |
|
"loss": 5.0434, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05759162303664921, |
|
"grad_norm": 3.3380537033081055, |
|
"learning_rate": 5.6410256410256414e-05, |
|
"loss": 4.999, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.060209424083769635, |
|
"grad_norm": 3.313646078109741, |
|
"learning_rate": 5.897435897435898e-05, |
|
"loss": 5.1477, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06282722513089005, |
|
"grad_norm": 2.918846368789673, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 4.6583, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06544502617801047, |
|
"grad_norm": 3.070826768875122, |
|
"learning_rate": 6.410256410256412e-05, |
|
"loss": 4.5479, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06806282722513089, |
|
"grad_norm": 3.04087233543396, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 4.592, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07068062827225131, |
|
"grad_norm": 2.8428642749786377, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 4.3682, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07329842931937172, |
|
"grad_norm": 3.0716347694396973, |
|
"learning_rate": 7.17948717948718e-05, |
|
"loss": 4.5837, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07591623036649214, |
|
"grad_norm": 2.571244478225708, |
|
"learning_rate": 7.435897435897436e-05, |
|
"loss": 3.992, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07853403141361257, |
|
"grad_norm": 3.7117347717285156, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 4.5136, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08115183246073299, |
|
"grad_norm": 2.827247381210327, |
|
"learning_rate": 7.948717948717948e-05, |
|
"loss": 4.4797, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.08376963350785341, |
|
"grad_norm": 2.7113707065582275, |
|
"learning_rate": 8.205128205128205e-05, |
|
"loss": 4.1275, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08638743455497382, |
|
"grad_norm": 2.837117910385132, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 4.3012, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08900523560209424, |
|
"grad_norm": 3.1402807235717773, |
|
"learning_rate": 8.717948717948718e-05, |
|
"loss": 4.3305, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09162303664921466, |
|
"grad_norm": 2.8632307052612305, |
|
"learning_rate": 8.974358974358975e-05, |
|
"loss": 4.0145, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09424083769633508, |
|
"grad_norm": 2.7232565879821777, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 4.2267, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0968586387434555, |
|
"grad_norm": 2.762054443359375, |
|
"learning_rate": 9.487179487179487e-05, |
|
"loss": 4.0502, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09947643979057591, |
|
"grad_norm": 18.91065788269043, |
|
"learning_rate": 9.743589743589744e-05, |
|
"loss": 4.1297, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10209424083769633, |
|
"grad_norm": 3.6653544902801514, |
|
"learning_rate": 0.0001, |
|
"loss": 4.2813, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10471204188481675, |
|
"grad_norm": 2.7435433864593506, |
|
"learning_rate": 9.999953057840867e-05, |
|
"loss": 3.7026, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10732984293193717, |
|
"grad_norm": 3.3734006881713867, |
|
"learning_rate": 9.999812232244895e-05, |
|
"loss": 4.1651, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1099476439790576, |
|
"grad_norm": 2.5766139030456543, |
|
"learning_rate": 9.999577525856345e-05, |
|
"loss": 4.0622, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.112565445026178, |
|
"grad_norm": 3.926454544067383, |
|
"learning_rate": 9.99924894308227e-05, |
|
"loss": 4.2311, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.11518324607329843, |
|
"grad_norm": 3.0207953453063965, |
|
"learning_rate": 9.998826490092421e-05, |
|
"loss": 3.7921, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11780104712041885, |
|
"grad_norm": 2.495757579803467, |
|
"learning_rate": 9.998310174819142e-05, |
|
"loss": 3.7601, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.12041884816753927, |
|
"grad_norm": 2.8352251052856445, |
|
"learning_rate": 9.997700006957214e-05, |
|
"loss": 3.6366, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12303664921465969, |
|
"grad_norm": 2.970708131790161, |
|
"learning_rate": 9.996995997963675e-05, |
|
"loss": 3.9135, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1256544502617801, |
|
"grad_norm": 2.4032399654388428, |
|
"learning_rate": 9.996198161057607e-05, |
|
"loss": 3.8009, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12827225130890052, |
|
"grad_norm": 3.217522144317627, |
|
"learning_rate": 9.995306511219885e-05, |
|
"loss": 3.9169, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13089005235602094, |
|
"grad_norm": 2.8503968715667725, |
|
"learning_rate": 9.994321065192894e-05, |
|
"loss": 3.9316, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13350785340314136, |
|
"grad_norm": 2.826801061630249, |
|
"learning_rate": 9.993241841480223e-05, |
|
"loss": 3.9417, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.13612565445026178, |
|
"grad_norm": 2.5175540447235107, |
|
"learning_rate": 9.992068860346306e-05, |
|
"loss": 3.5643, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1387434554973822, |
|
"grad_norm": 2.7539591789245605, |
|
"learning_rate": 9.990802143816051e-05, |
|
"loss": 3.5607, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14136125654450263, |
|
"grad_norm": 2.4108636379241943, |
|
"learning_rate": 9.989441715674422e-05, |
|
"loss": 3.4459, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14397905759162305, |
|
"grad_norm": 3.2774994373321533, |
|
"learning_rate": 9.987987601465991e-05, |
|
"loss": 3.9545, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.14659685863874344, |
|
"grad_norm": 2.3104467391967773, |
|
"learning_rate": 9.986439828494465e-05, |
|
"loss": 3.6954, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14921465968586387, |
|
"grad_norm": 2.5438120365142822, |
|
"learning_rate": 9.984798425822163e-05, |
|
"loss": 3.5669, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1518324607329843, |
|
"grad_norm": 2.4583828449249268, |
|
"learning_rate": 9.98306342426948e-05, |
|
"loss": 3.8843, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1544502617801047, |
|
"grad_norm": 2.34213924407959, |
|
"learning_rate": 9.981234856414307e-05, |
|
"loss": 3.7361, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.15706806282722513, |
|
"grad_norm": 2.6676721572875977, |
|
"learning_rate": 9.979312756591407e-05, |
|
"loss": 3.7573, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15968586387434555, |
|
"grad_norm": 2.4997060298919678, |
|
"learning_rate": 9.977297160891792e-05, |
|
"loss": 3.7876, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.16230366492146597, |
|
"grad_norm": 2.709794044494629, |
|
"learning_rate": 9.975188107162026e-05, |
|
"loss": 3.614, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1649214659685864, |
|
"grad_norm": 2.355778932571411, |
|
"learning_rate": 9.972985635003522e-05, |
|
"loss": 3.4262, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.16753926701570682, |
|
"grad_norm": 2.5319371223449707, |
|
"learning_rate": 9.970689785771798e-05, |
|
"loss": 3.5963, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17015706806282724, |
|
"grad_norm": 2.559286117553711, |
|
"learning_rate": 9.968300602575707e-05, |
|
"loss": 3.6722, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.17277486910994763, |
|
"grad_norm": 2.4441914558410645, |
|
"learning_rate": 9.965818130276612e-05, |
|
"loss": 3.4972, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.17539267015706805, |
|
"grad_norm": 2.2148141860961914, |
|
"learning_rate": 9.963242415487557e-05, |
|
"loss": 3.6571, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.17801047120418848, |
|
"grad_norm": 2.3289103507995605, |
|
"learning_rate": 9.96057350657239e-05, |
|
"loss": 3.6085, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1806282722513089, |
|
"grad_norm": 2.6062214374542236, |
|
"learning_rate": 9.957811453644847e-05, |
|
"loss": 3.4849, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.18324607329842932, |
|
"grad_norm": 2.347382068634033, |
|
"learning_rate": 9.954956308567622e-05, |
|
"loss": 3.3534, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18586387434554974, |
|
"grad_norm": 4.254333972930908, |
|
"learning_rate": 9.952008124951381e-05, |
|
"loss": 3.6956, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.18848167539267016, |
|
"grad_norm": 2.4889872074127197, |
|
"learning_rate": 9.948966958153771e-05, |
|
"loss": 3.5671, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.19109947643979058, |
|
"grad_norm": 34.48236083984375, |
|
"learning_rate": 9.945832865278363e-05, |
|
"loss": 3.5257, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.193717277486911, |
|
"grad_norm": 2.4782514572143555, |
|
"learning_rate": 9.942605905173592e-05, |
|
"loss": 3.5874, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.19633507853403143, |
|
"grad_norm": 2.386019706726074, |
|
"learning_rate": 9.939286138431647e-05, |
|
"loss": 3.5946, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.19895287958115182, |
|
"grad_norm": 2.1345767974853516, |
|
"learning_rate": 9.935873627387336e-05, |
|
"loss": 3.3744, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.20157068062827224, |
|
"grad_norm": 2.562124013900757, |
|
"learning_rate": 9.932368436116915e-05, |
|
"loss": 3.4642, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.20418848167539266, |
|
"grad_norm": 2.3154282569885254, |
|
"learning_rate": 9.92877063043688e-05, |
|
"loss": 3.4095, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20680628272251309, |
|
"grad_norm": 2.225123643875122, |
|
"learning_rate": 9.925080277902743e-05, |
|
"loss": 3.496, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2094240837696335, |
|
"grad_norm": 2.0953874588012695, |
|
"learning_rate": 9.921297447807744e-05, |
|
"loss": 3.4574, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.21204188481675393, |
|
"grad_norm": 2.2577614784240723, |
|
"learning_rate": 9.917422211181571e-05, |
|
"loss": 3.5562, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.21465968586387435, |
|
"grad_norm": 2.2811625003814697, |
|
"learning_rate": 9.913454640789013e-05, |
|
"loss": 3.3894, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21727748691099477, |
|
"grad_norm": 2.139005661010742, |
|
"learning_rate": 9.909394811128598e-05, |
|
"loss": 3.3137, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.2198952879581152, |
|
"grad_norm": 2.058781147003174, |
|
"learning_rate": 9.905242798431196e-05, |
|
"loss": 3.4472, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.22251308900523561, |
|
"grad_norm": 14.535298347473145, |
|
"learning_rate": 9.900998680658581e-05, |
|
"loss": 3.3115, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.225130890052356, |
|
"grad_norm": 2.935969114303589, |
|
"learning_rate": 9.896662537501976e-05, |
|
"loss": 3.3982, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.22774869109947643, |
|
"grad_norm": 2.613450288772583, |
|
"learning_rate": 9.892234450380547e-05, |
|
"loss": 3.3868, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.23036649214659685, |
|
"grad_norm": 2.750882387161255, |
|
"learning_rate": 9.887714502439884e-05, |
|
"loss": 3.2625, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.23298429319371727, |
|
"grad_norm": 2.1589114665985107, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 3.1647, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2356020942408377, |
|
"grad_norm": 2.105313777923584, |
|
"learning_rate": 9.878399365305906e-05, |
|
"loss": 3.1404, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23821989528795812, |
|
"grad_norm": 3.4829330444335938, |
|
"learning_rate": 9.873604351021648e-05, |
|
"loss": 3.3671, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.24083769633507854, |
|
"grad_norm": 3.3065099716186523, |
|
"learning_rate": 9.868717825732994e-05, |
|
"loss": 3.4199, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.24345549738219896, |
|
"grad_norm": 5.105870723724365, |
|
"learning_rate": 9.863739881193558e-05, |
|
"loss": 3.4172, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.24607329842931938, |
|
"grad_norm": 3.03316593170166, |
|
"learning_rate": 9.858670610873528e-05, |
|
"loss": 3.2284, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2486910994764398, |
|
"grad_norm": 2.4118738174438477, |
|
"learning_rate": 9.853510109957903e-05, |
|
"loss": 3.4009, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2513089005235602, |
|
"grad_norm": 5.970595359802246, |
|
"learning_rate": 9.848258475344702e-05, |
|
"loss": 3.387, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.25392670157068065, |
|
"grad_norm": 2.43656325340271, |
|
"learning_rate": 9.842915805643155e-05, |
|
"loss": 3.3451, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.25654450261780104, |
|
"grad_norm": 3.862488031387329, |
|
"learning_rate": 9.837482201171842e-05, |
|
"loss": 3.2579, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2591623036649215, |
|
"grad_norm": 2.120502471923828, |
|
"learning_rate": 9.831957763956813e-05, |
|
"loss": 3.3365, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.2617801047120419, |
|
"grad_norm": 2.438363552093506, |
|
"learning_rate": 9.826342597729672e-05, |
|
"loss": 3.1251, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2643979057591623, |
|
"grad_norm": 2.0495247840881348, |
|
"learning_rate": 9.820636807925628e-05, |
|
"loss": 3.3333, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2670157068062827, |
|
"grad_norm": 2.2188355922698975, |
|
"learning_rate": 9.814840501681522e-05, |
|
"loss": 3.1956, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2696335078534031, |
|
"grad_norm": 2.230815887451172, |
|
"learning_rate": 9.808953787833801e-05, |
|
"loss": 3.1073, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.27225130890052357, |
|
"grad_norm": 2.1100032329559326, |
|
"learning_rate": 9.802976776916494e-05, |
|
"loss": 3.184, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.27486910994764396, |
|
"grad_norm": 2.1523215770721436, |
|
"learning_rate": 9.796909581159116e-05, |
|
"loss": 3.3009, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2774869109947644, |
|
"grad_norm": 2.2816920280456543, |
|
"learning_rate": 9.790752314484577e-05, |
|
"loss": 3.1125, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2801047120418848, |
|
"grad_norm": 2.134192705154419, |
|
"learning_rate": 9.784505092507031e-05, |
|
"loss": 3.0986, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.28272251308900526, |
|
"grad_norm": 2.156048536300659, |
|
"learning_rate": 9.778168032529716e-05, |
|
"loss": 3.3802, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.28534031413612565, |
|
"grad_norm": 2.2478604316711426, |
|
"learning_rate": 9.771741253542741e-05, |
|
"loss": 3.2685, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2879581151832461, |
|
"grad_norm": 2.222820997238159, |
|
"learning_rate": 9.765224876220859e-05, |
|
"loss": 3.1221, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2905759162303665, |
|
"grad_norm": 2.3353006839752197, |
|
"learning_rate": 9.758619022921202e-05, |
|
"loss": 3.1653, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2931937172774869, |
|
"grad_norm": 14.295672416687012, |
|
"learning_rate": 9.751923817680972e-05, |
|
"loss": 3.146, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.29581151832460734, |
|
"grad_norm": 2.505723714828491, |
|
"learning_rate": 9.745139386215128e-05, |
|
"loss": 3.2138, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.29842931937172773, |
|
"grad_norm": 2.1925199031829834, |
|
"learning_rate": 9.738265855914013e-05, |
|
"loss": 3.1778, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3010471204188482, |
|
"grad_norm": 2.1371121406555176, |
|
"learning_rate": 9.731303355840968e-05, |
|
"loss": 3.0539, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3036649214659686, |
|
"grad_norm": 2.133293628692627, |
|
"learning_rate": 9.724252016729909e-05, |
|
"loss": 3.0521, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.306282722513089, |
|
"grad_norm": 2.2210733890533447, |
|
"learning_rate": 9.717111970982869e-05, |
|
"loss": 3.1743, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3089005235602094, |
|
"grad_norm": 2.2624800205230713, |
|
"learning_rate": 9.709883352667513e-05, |
|
"loss": 3.2274, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.31151832460732987, |
|
"grad_norm": 2.1522083282470703, |
|
"learning_rate": 9.70256629751462e-05, |
|
"loss": 3.1238, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.31413612565445026, |
|
"grad_norm": 1.954613447189331, |
|
"learning_rate": 9.69516094291554e-05, |
|
"loss": 3.0268, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.31675392670157065, |
|
"grad_norm": 1.9362231492996216, |
|
"learning_rate": 9.687667427919605e-05, |
|
"loss": 3.148, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3193717277486911, |
|
"grad_norm": 2.171438455581665, |
|
"learning_rate": 9.680085893231521e-05, |
|
"loss": 3.069, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3219895287958115, |
|
"grad_norm": 2.280571699142456, |
|
"learning_rate": 9.672416481208738e-05, |
|
"loss": 3.3274, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.32460732984293195, |
|
"grad_norm": 2.0850417613983154, |
|
"learning_rate": 9.664659335858755e-05, |
|
"loss": 3.2012, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.32722513089005234, |
|
"grad_norm": 2.408496141433716, |
|
"learning_rate": 9.656814602836434e-05, |
|
"loss": 3.2839, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3298429319371728, |
|
"grad_norm": 120.96231842041016, |
|
"learning_rate": 9.648882429441257e-05, |
|
"loss": 3.1548, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3324607329842932, |
|
"grad_norm": 2.1636528968811035, |
|
"learning_rate": 9.640862964614564e-05, |
|
"loss": 3.0179, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.33507853403141363, |
|
"grad_norm": 2.2096540927886963, |
|
"learning_rate": 9.632756358936749e-05, |
|
"loss": 3.261, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.337696335078534, |
|
"grad_norm": 2.2390010356903076, |
|
"learning_rate": 9.624562764624445e-05, |
|
"loss": 3.1954, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3403141361256545, |
|
"grad_norm": 1.8933037519454956, |
|
"learning_rate": 9.616282335527653e-05, |
|
"loss": 3.2055, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.34293193717277487, |
|
"grad_norm": 6.112705230712891, |
|
"learning_rate": 9.607915227126862e-05, |
|
"loss": 3.2929, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.34554973821989526, |
|
"grad_norm": 4.289401054382324, |
|
"learning_rate": 9.599461596530127e-05, |
|
"loss": 3.1634, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3481675392670157, |
|
"grad_norm": 2.0408754348754883, |
|
"learning_rate": 9.590921602470116e-05, |
|
"loss": 3.1964, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3507853403141361, |
|
"grad_norm": 2.1104061603546143, |
|
"learning_rate": 9.582295405301131e-05, |
|
"loss": 3.0235, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.35340314136125656, |
|
"grad_norm": 1.9149036407470703, |
|
"learning_rate": 9.573583166996103e-05, |
|
"loss": 3.2854, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.35602094240837695, |
|
"grad_norm": 2.161102056503296, |
|
"learning_rate": 9.564785051143541e-05, |
|
"loss": 3.1318, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3586387434554974, |
|
"grad_norm": 2.186587333679199, |
|
"learning_rate": 9.555901222944468e-05, |
|
"loss": 3.2125, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3612565445026178, |
|
"grad_norm": 1.8297839164733887, |
|
"learning_rate": 9.546931849209314e-05, |
|
"loss": 2.96, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.36387434554973824, |
|
"grad_norm": 2.2810025215148926, |
|
"learning_rate": 9.537877098354786e-05, |
|
"loss": 3.0554, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.36649214659685864, |
|
"grad_norm": 2.006052017211914, |
|
"learning_rate": 9.528737140400707e-05, |
|
"loss": 3.1757, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.36910994764397903, |
|
"grad_norm": 2.3248322010040283, |
|
"learning_rate": 9.519512146966823e-05, |
|
"loss": 3.2321, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3717277486910995, |
|
"grad_norm": 2.1551225185394287, |
|
"learning_rate": 9.510202291269576e-05, |
|
"loss": 3.0333, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3743455497382199, |
|
"grad_norm": 2.613978147506714, |
|
"learning_rate": 9.500807748118856e-05, |
|
"loss": 3.3177, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3769633507853403, |
|
"grad_norm": 2.302820920944214, |
|
"learning_rate": 9.491328693914722e-05, |
|
"loss": 3.069, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3795811518324607, |
|
"grad_norm": 2.0801970958709717, |
|
"learning_rate": 9.48176530664408e-05, |
|
"loss": 3.1207, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.38219895287958117, |
|
"grad_norm": 1.8670393228530884, |
|
"learning_rate": 9.472117765877349e-05, |
|
"loss": 2.988, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.38481675392670156, |
|
"grad_norm": 2.123073101043701, |
|
"learning_rate": 9.462386252765087e-05, |
|
"loss": 3.1316, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.387434554973822, |
|
"grad_norm": 2.0116608142852783, |
|
"learning_rate": 9.452570950034589e-05, |
|
"loss": 3.1323, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3900523560209424, |
|
"grad_norm": 1.8206120729446411, |
|
"learning_rate": 9.442672041986457e-05, |
|
"loss": 2.8792, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.39267015706806285, |
|
"grad_norm": 1.9137699604034424, |
|
"learning_rate": 9.432689714491136e-05, |
|
"loss": 3.0191, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.39528795811518325, |
|
"grad_norm": 1.9897340536117554, |
|
"learning_rate": 9.422624154985427e-05, |
|
"loss": 3.1578, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.39790575916230364, |
|
"grad_norm": 1.8997538089752197, |
|
"learning_rate": 9.412475552468974e-05, |
|
"loss": 3.0389, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.4005235602094241, |
|
"grad_norm": 2.249044895172119, |
|
"learning_rate": 9.402244097500696e-05, |
|
"loss": 3.3441, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.4031413612565445, |
|
"grad_norm": 1.9846928119659424, |
|
"learning_rate": 9.391929982195232e-05, |
|
"loss": 3.2146, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.40575916230366493, |
|
"grad_norm": 2.240514039993286, |
|
"learning_rate": 9.381533400219318e-05, |
|
"loss": 3.2069, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4083769633507853, |
|
"grad_norm": 2.115003824234009, |
|
"learning_rate": 9.371054546788157e-05, |
|
"loss": 2.9426, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.4109947643979058, |
|
"grad_norm": 1.9737358093261719, |
|
"learning_rate": 9.36049361866175e-05, |
|
"loss": 3.1503, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.41361256544502617, |
|
"grad_norm": 2.096264600753784, |
|
"learning_rate": 9.349850814141204e-05, |
|
"loss": 3.0736, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4162303664921466, |
|
"grad_norm": 2.3380467891693115, |
|
"learning_rate": 9.339126333065007e-05, |
|
"loss": 3.2136, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 2.1407546997070312, |
|
"learning_rate": 9.328320376805281e-05, |
|
"loss": 3.1564, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4214659685863874, |
|
"grad_norm": 1.9080549478530884, |
|
"learning_rate": 9.317433148263995e-05, |
|
"loss": 2.9173, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.42408376963350786, |
|
"grad_norm": 4.025334358215332, |
|
"learning_rate": 9.30646485186915e-05, |
|
"loss": 2.8801, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.42670157068062825, |
|
"grad_norm": 2.0670013427734375, |
|
"learning_rate": 9.295415693570955e-05, |
|
"loss": 2.9837, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4293193717277487, |
|
"grad_norm": 2.1820228099823, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 3.1169, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4319371727748691, |
|
"grad_norm": 1.9107846021652222, |
|
"learning_rate": 9.273075622653102e-05, |
|
"loss": 3.1854, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.43455497382198954, |
|
"grad_norm": 2.0229499340057373, |
|
"learning_rate": 9.261785129509914e-05, |
|
"loss": 2.9287, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.43717277486910994, |
|
"grad_norm": 1.9189234972000122, |
|
"learning_rate": 9.250414613408427e-05, |
|
"loss": 3.0001, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4397905759162304, |
|
"grad_norm": 2.0910720825195312, |
|
"learning_rate": 9.238964287851275e-05, |
|
"loss": 2.9357, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4424083769633508, |
|
"grad_norm": 1.9052855968475342, |
|
"learning_rate": 9.22743436783966e-05, |
|
"loss": 2.9641, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.44502617801047123, |
|
"grad_norm": 1.9111841917037964, |
|
"learning_rate": 9.215825069869316e-05, |
|
"loss": 3.0879, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4476439790575916, |
|
"grad_norm": 1.898743987083435, |
|
"learning_rate": 9.20413661192645e-05, |
|
"loss": 2.9687, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.450261780104712, |
|
"grad_norm": 1.9681291580200195, |
|
"learning_rate": 9.192369213483642e-05, |
|
"loss": 3.0313, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.45287958115183247, |
|
"grad_norm": 1.937829613685608, |
|
"learning_rate": 9.180523095495727e-05, |
|
"loss": 2.9251, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.45549738219895286, |
|
"grad_norm": 2.15289568901062, |
|
"learning_rate": 9.168598480395651e-05, |
|
"loss": 3.0487, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4581151832460733, |
|
"grad_norm": 2.0502867698669434, |
|
"learning_rate": 9.156595592090284e-05, |
|
"loss": 2.8817, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4607329842931937, |
|
"grad_norm": 2.0720651149749756, |
|
"learning_rate": 9.14451465595622e-05, |
|
"loss": 3.009, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.46335078534031415, |
|
"grad_norm": 2.0610387325286865, |
|
"learning_rate": 9.132355898835556e-05, |
|
"loss": 3.0353, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.46596858638743455, |
|
"grad_norm": 2.160541534423828, |
|
"learning_rate": 9.12011954903161e-05, |
|
"loss": 3.1546, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.468586387434555, |
|
"grad_norm": 2.1311235427856445, |
|
"learning_rate": 9.107805836304658e-05, |
|
"loss": 3.0889, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4712041884816754, |
|
"grad_norm": 1.902626633644104, |
|
"learning_rate": 9.095414991867604e-05, |
|
"loss": 2.9551, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4738219895287958, |
|
"grad_norm": 1.9568802118301392, |
|
"learning_rate": 9.082947248381643e-05, |
|
"loss": 2.9683, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.47643979057591623, |
|
"grad_norm": 2.024705171585083, |
|
"learning_rate": 9.070402839951897e-05, |
|
"loss": 3.0207, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4790575916230366, |
|
"grad_norm": 1.8830217123031616, |
|
"learning_rate": 9.057782002123012e-05, |
|
"loss": 2.8518, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4816753926701571, |
|
"grad_norm": 2.031515598297119, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 2.9262, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.48429319371727747, |
|
"grad_norm": 1.9783765077590942, |
|
"learning_rate": 9.03231198761748e-05, |
|
"loss": 2.9476, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.4869109947643979, |
|
"grad_norm": 1.9431626796722412, |
|
"learning_rate": 9.019463289187827e-05, |
|
"loss": 2.8431, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4895287958115183, |
|
"grad_norm": 1.8779281377792358, |
|
"learning_rate": 9.00653911784403e-05, |
|
"loss": 3.0361, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.49214659685863876, |
|
"grad_norm": 1.810121774673462, |
|
"learning_rate": 8.993539716261498e-05, |
|
"loss": 2.847, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.49476439790575916, |
|
"grad_norm": 2.5064663887023926, |
|
"learning_rate": 8.980465328528219e-05, |
|
"loss": 2.9118, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4973821989528796, |
|
"grad_norm": 1.9613292217254639, |
|
"learning_rate": 8.96731620014019e-05, |
|
"loss": 2.8882, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 2.058799982070923, |
|
"learning_rate": 8.954092577996803e-05, |
|
"loss": 3.0066, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.5026178010471204, |
|
"grad_norm": 6.452920436859131, |
|
"learning_rate": 8.940794710396205e-05, |
|
"loss": 3.005, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.5052356020942408, |
|
"grad_norm": 2.1949808597564697, |
|
"learning_rate": 8.927422847030646e-05, |
|
"loss": 2.9602, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.5078534031413613, |
|
"grad_norm": 2.0062756538391113, |
|
"learning_rate": 8.913977238981778e-05, |
|
"loss": 2.9961, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.5104712041884817, |
|
"grad_norm": 1.8738442659378052, |
|
"learning_rate": 8.900458138715954e-05, |
|
"loss": 3.0411, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5130890052356021, |
|
"grad_norm": 1.8305824995040894, |
|
"learning_rate": 8.886865800079474e-05, |
|
"loss": 3.0892, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.5157068062827225, |
|
"grad_norm": 1.8090300559997559, |
|
"learning_rate": 8.873200478293826e-05, |
|
"loss": 2.7575, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.518324607329843, |
|
"grad_norm": 1.870323657989502, |
|
"learning_rate": 8.859462429950897e-05, |
|
"loss": 3.054, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5209424083769634, |
|
"grad_norm": 1.965692162513733, |
|
"learning_rate": 8.845651913008145e-05, |
|
"loss": 2.8143, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.5235602094240838, |
|
"grad_norm": 1.88938570022583, |
|
"learning_rate": 8.831769186783765e-05, |
|
"loss": 2.9259, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5261780104712042, |
|
"grad_norm": 1.8629285097122192, |
|
"learning_rate": 8.817814511951814e-05, |
|
"loss": 2.8782, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5287958115183246, |
|
"grad_norm": 1.873532772064209, |
|
"learning_rate": 8.80378815053732e-05, |
|
"loss": 2.8025, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5314136125654451, |
|
"grad_norm": 2.1665189266204834, |
|
"learning_rate": 8.789690365911356e-05, |
|
"loss": 3.0567, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5340314136125655, |
|
"grad_norm": 1.980928659439087, |
|
"learning_rate": 8.775521422786104e-05, |
|
"loss": 2.9795, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5366492146596858, |
|
"grad_norm": 2.0913095474243164, |
|
"learning_rate": 8.761281587209876e-05, |
|
"loss": 3.0988, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5392670157068062, |
|
"grad_norm": 1.8205846548080444, |
|
"learning_rate": 8.746971126562124e-05, |
|
"loss": 2.9828, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5418848167539267, |
|
"grad_norm": 2.1041507720947266, |
|
"learning_rate": 8.732590309548416e-05, |
|
"loss": 3.0541, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5445026178010471, |
|
"grad_norm": 2.023577928543091, |
|
"learning_rate": 8.718139406195393e-05, |
|
"loss": 3.0061, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5471204188481675, |
|
"grad_norm": 1.8006081581115723, |
|
"learning_rate": 8.703618687845696e-05, |
|
"loss": 2.789, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5497382198952879, |
|
"grad_norm": 1.840972900390625, |
|
"learning_rate": 8.689028427152874e-05, |
|
"loss": 2.9437, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5523560209424084, |
|
"grad_norm": 1.9612823724746704, |
|
"learning_rate": 8.674368898076261e-05, |
|
"loss": 3.1407, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5549738219895288, |
|
"grad_norm": 2.0491254329681396, |
|
"learning_rate": 8.65964037587584e-05, |
|
"loss": 2.9557, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5575916230366492, |
|
"grad_norm": 2.0597126483917236, |
|
"learning_rate": 8.644843137107059e-05, |
|
"loss": 2.9996, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5602094240837696, |
|
"grad_norm": 1.7746310234069824, |
|
"learning_rate": 8.629977459615655e-05, |
|
"loss": 3.0459, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.56282722513089, |
|
"grad_norm": 1.892563819885254, |
|
"learning_rate": 8.615043622532429e-05, |
|
"loss": 2.9831, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5654450261780105, |
|
"grad_norm": 1.7289716005325317, |
|
"learning_rate": 8.600041906268e-05, |
|
"loss": 2.7416, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5680628272251309, |
|
"grad_norm": 1.929553508758545, |
|
"learning_rate": 8.584972592507553e-05, |
|
"loss": 2.9513, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5706806282722513, |
|
"grad_norm": 1.957163691520691, |
|
"learning_rate": 8.569835964205536e-05, |
|
"loss": 2.7647, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5732984293193717, |
|
"grad_norm": 1.8348171710968018, |
|
"learning_rate": 8.554632305580354e-05, |
|
"loss": 2.9897, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5759162303664922, |
|
"grad_norm": 1.811667561531067, |
|
"learning_rate": 8.539361902109033e-05, |
|
"loss": 2.8322, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5785340314136126, |
|
"grad_norm": 2.0632617473602295, |
|
"learning_rate": 8.524025040521856e-05, |
|
"loss": 2.9788, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.581151832460733, |
|
"grad_norm": 1.847805142402649, |
|
"learning_rate": 8.508622008796985e-05, |
|
"loss": 2.8168, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5837696335078534, |
|
"grad_norm": 1.7859346866607666, |
|
"learning_rate": 8.493153096155042e-05, |
|
"loss": 2.819, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5863874345549738, |
|
"grad_norm": 1.756142020225525, |
|
"learning_rate": 8.477618593053693e-05, |
|
"loss": 2.9083, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5890052356020943, |
|
"grad_norm": 1.7891409397125244, |
|
"learning_rate": 8.462018791182184e-05, |
|
"loss": 2.911, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5916230366492147, |
|
"grad_norm": 1.923041582107544, |
|
"learning_rate": 8.44635398345587e-05, |
|
"loss": 2.7662, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5942408376963351, |
|
"grad_norm": 2.131024122238159, |
|
"learning_rate": 8.430624464010706e-05, |
|
"loss": 2.8831, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5968586387434555, |
|
"grad_norm": 1.9196547269821167, |
|
"learning_rate": 8.414830528197737e-05, |
|
"loss": 2.875, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.599476439790576, |
|
"grad_norm": 1.9374908208847046, |
|
"learning_rate": 8.39897247257754e-05, |
|
"loss": 3.0421, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.6020942408376964, |
|
"grad_norm": 1.9606311321258545, |
|
"learning_rate": 8.383050594914665e-05, |
|
"loss": 2.9347, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6047120418848168, |
|
"grad_norm": 1.8933106660842896, |
|
"learning_rate": 8.367065194172037e-05, |
|
"loss": 2.9623, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.6073298429319371, |
|
"grad_norm": 2.1029906272888184, |
|
"learning_rate": 8.351016570505347e-05, |
|
"loss": 2.8704, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.6099476439790575, |
|
"grad_norm": 1.86416494846344, |
|
"learning_rate": 8.334905025257413e-05, |
|
"loss": 3.0935, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.612565445026178, |
|
"grad_norm": 1.8532062768936157, |
|
"learning_rate": 8.318730860952522e-05, |
|
"loss": 2.7635, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.6151832460732984, |
|
"grad_norm": 2.0105836391448975, |
|
"learning_rate": 8.302494381290756e-05, |
|
"loss": 3.0954, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6178010471204188, |
|
"grad_norm": 2.164985418319702, |
|
"learning_rate": 8.286195891142274e-05, |
|
"loss": 2.9424, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.6204188481675392, |
|
"grad_norm": 1.9459847211837769, |
|
"learning_rate": 8.269835696541607e-05, |
|
"loss": 2.8438, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.6230366492146597, |
|
"grad_norm": 3.4676437377929688, |
|
"learning_rate": 8.253414104681898e-05, |
|
"loss": 3.0494, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6256544502617801, |
|
"grad_norm": 1.8192616701126099, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 2.7965, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.6282722513089005, |
|
"grad_norm": 1.8246678113937378, |
|
"learning_rate": 8.220387963716377e-05, |
|
"loss": 2.8788, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6308900523560209, |
|
"grad_norm": 1.8622642755508423, |
|
"learning_rate": 8.20378403473791e-05, |
|
"loss": 3.0934, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6335078534031413, |
|
"grad_norm": 1.927139163017273, |
|
"learning_rate": 8.18711994874345e-05, |
|
"loss": 2.9704, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6361256544502618, |
|
"grad_norm": 1.7952053546905518, |
|
"learning_rate": 8.170396018632264e-05, |
|
"loss": 2.8313, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6387434554973822, |
|
"grad_norm": 1.8302192687988281, |
|
"learning_rate": 8.153612558427311e-05, |
|
"loss": 2.8198, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6413612565445026, |
|
"grad_norm": 1.809410810470581, |
|
"learning_rate": 8.13676988326933e-05, |
|
"loss": 2.6532, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.643979057591623, |
|
"grad_norm": 1.8222423791885376, |
|
"learning_rate": 8.119868309410943e-05, |
|
"loss": 2.9867, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6465968586387435, |
|
"grad_norm": 7.688499927520752, |
|
"learning_rate": 8.102908154210693e-05, |
|
"loss": 2.8391, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6492146596858639, |
|
"grad_norm": 1.8286641836166382, |
|
"learning_rate": 8.085889736127103e-05, |
|
"loss": 2.9778, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6518324607329843, |
|
"grad_norm": 1.816272258758545, |
|
"learning_rate": 8.068813374712688e-05, |
|
"loss": 2.8969, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6544502617801047, |
|
"grad_norm": 2.080170154571533, |
|
"learning_rate": 8.05167939060796e-05, |
|
"loss": 2.9861, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6570680628272252, |
|
"grad_norm": 1.8223365545272827, |
|
"learning_rate": 8.0344881055354e-05, |
|
"loss": 2.9545, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6596858638743456, |
|
"grad_norm": 1.891790509223938, |
|
"learning_rate": 8.017239842293427e-05, |
|
"loss": 2.8586, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.662303664921466, |
|
"grad_norm": 1.8906011581420898, |
|
"learning_rate": 7.999934924750325e-05, |
|
"loss": 2.94, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6649214659685864, |
|
"grad_norm": 1.8247802257537842, |
|
"learning_rate": 7.982573677838172e-05, |
|
"loss": 2.6747, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6675392670157068, |
|
"grad_norm": 1.8509819507598877, |
|
"learning_rate": 7.965156427546735e-05, |
|
"loss": 2.8795, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6701570680628273, |
|
"grad_norm": 1.796658992767334, |
|
"learning_rate": 7.947683500917347e-05, |
|
"loss": 2.8549, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6727748691099477, |
|
"grad_norm": 1.771183729171753, |
|
"learning_rate": 7.93015522603677e-05, |
|
"loss": 2.9275, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.675392670157068, |
|
"grad_norm": 1.8382376432418823, |
|
"learning_rate": 7.91257193203103e-05, |
|
"loss": 3.0237, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6780104712041884, |
|
"grad_norm": 1.8723816871643066, |
|
"learning_rate": 7.894933949059245e-05, |
|
"loss": 2.6623, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.680628272251309, |
|
"grad_norm": 1.9213804006576538, |
|
"learning_rate": 7.877241608307411e-05, |
|
"loss": 2.8994, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6832460732984293, |
|
"grad_norm": 1.7478219270706177, |
|
"learning_rate": 7.8594952419822e-05, |
|
"loss": 2.7949, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6858638743455497, |
|
"grad_norm": 1.8188650608062744, |
|
"learning_rate": 7.841695183304713e-05, |
|
"loss": 2.8178, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6884816753926701, |
|
"grad_norm": 1.8901054859161377, |
|
"learning_rate": 7.823841766504227e-05, |
|
"loss": 2.9314, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6910994764397905, |
|
"grad_norm": 1.9486525058746338, |
|
"learning_rate": 7.805935326811912e-05, |
|
"loss": 2.7373, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.693717277486911, |
|
"grad_norm": 1.7464019060134888, |
|
"learning_rate": 7.787976200454546e-05, |
|
"loss": 2.8273, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6963350785340314, |
|
"grad_norm": 1.8585435152053833, |
|
"learning_rate": 7.769964724648196e-05, |
|
"loss": 2.8128, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6989528795811518, |
|
"grad_norm": 1.7956280708312988, |
|
"learning_rate": 7.751901237591887e-05, |
|
"loss": 2.7562, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.7015706806282722, |
|
"grad_norm": 2.0384879112243652, |
|
"learning_rate": 7.733786078461252e-05, |
|
"loss": 2.9731, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.7041884816753927, |
|
"grad_norm": 1.7548547983169556, |
|
"learning_rate": 7.715619587402164e-05, |
|
"loss": 2.834, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.7068062827225131, |
|
"grad_norm": 1.651876449584961, |
|
"learning_rate": 7.697402105524351e-05, |
|
"loss": 2.7768, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7094240837696335, |
|
"grad_norm": 1.8058605194091797, |
|
"learning_rate": 7.679133974894983e-05, |
|
"loss": 2.7503, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.7120418848167539, |
|
"grad_norm": 1.921723484992981, |
|
"learning_rate": 7.66081553853226e-05, |
|
"loss": 2.7922, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.7146596858638743, |
|
"grad_norm": 1.990212321281433, |
|
"learning_rate": 7.642447140398965e-05, |
|
"loss": 2.8968, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.7172774869109948, |
|
"grad_norm": 1.8311606645584106, |
|
"learning_rate": 7.624029125396004e-05, |
|
"loss": 2.9963, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.7198952879581152, |
|
"grad_norm": 2.0072896480560303, |
|
"learning_rate": 7.605561839355933e-05, |
|
"loss": 2.875, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7225130890052356, |
|
"grad_norm": 1.9953948259353638, |
|
"learning_rate": 7.587045629036463e-05, |
|
"loss": 2.9325, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.725130890052356, |
|
"grad_norm": 1.8839069604873657, |
|
"learning_rate": 7.568480842113952e-05, |
|
"loss": 2.682, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.7277486910994765, |
|
"grad_norm": 1.6751598119735718, |
|
"learning_rate": 7.549867827176873e-05, |
|
"loss": 2.7488, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7303664921465969, |
|
"grad_norm": 1.7167940139770508, |
|
"learning_rate": 7.53120693371927e-05, |
|
"loss": 2.9112, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.7329842931937173, |
|
"grad_norm": 1.9378173351287842, |
|
"learning_rate": 7.512498512134194e-05, |
|
"loss": 2.9304, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7356020942408377, |
|
"grad_norm": 1.7746607065200806, |
|
"learning_rate": 7.493742913707127e-05, |
|
"loss": 2.9898, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.7382198952879581, |
|
"grad_norm": 1.9426668882369995, |
|
"learning_rate": 7.474940490609383e-05, |
|
"loss": 2.9828, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7408376963350786, |
|
"grad_norm": 1.8619779348373413, |
|
"learning_rate": 7.456091595891498e-05, |
|
"loss": 2.7965, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.743455497382199, |
|
"grad_norm": 1.691998839378357, |
|
"learning_rate": 7.437196583476596e-05, |
|
"loss": 2.8217, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7460732984293194, |
|
"grad_norm": 1.8032022714614868, |
|
"learning_rate": 7.41825580815375e-05, |
|
"loss": 2.7456, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7486910994764397, |
|
"grad_norm": 1.8445478677749634, |
|
"learning_rate": 7.399269625571316e-05, |
|
"loss": 2.855, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7513089005235603, |
|
"grad_norm": 1.69001305103302, |
|
"learning_rate": 7.380238392230257e-05, |
|
"loss": 2.6786, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7539267015706806, |
|
"grad_norm": 1.8098856210708618, |
|
"learning_rate": 7.361162465477442e-05, |
|
"loss": 2.8392, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.756544502617801, |
|
"grad_norm": 2.023061752319336, |
|
"learning_rate": 7.342042203498951e-05, |
|
"loss": 3.1742, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7591623036649214, |
|
"grad_norm": 2.027721405029297, |
|
"learning_rate": 7.322877965313335e-05, |
|
"loss": 2.707, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7617801047120419, |
|
"grad_norm": 2.092510938644409, |
|
"learning_rate": 7.303670110764881e-05, |
|
"loss": 2.9764, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7643979057591623, |
|
"grad_norm": 1.794023871421814, |
|
"learning_rate": 7.284419000516855e-05, |
|
"loss": 2.8471, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7670157068062827, |
|
"grad_norm": 1.998582124710083, |
|
"learning_rate": 7.26512499604473e-05, |
|
"loss": 2.8464, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7696335078534031, |
|
"grad_norm": 1.9297953844070435, |
|
"learning_rate": 7.245788459629396e-05, |
|
"loss": 2.6567, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7722513089005235, |
|
"grad_norm": 1.889047622680664, |
|
"learning_rate": 7.226409754350361e-05, |
|
"loss": 2.8468, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.774869109947644, |
|
"grad_norm": 1.806944489479065, |
|
"learning_rate": 7.206989244078934e-05, |
|
"loss": 2.7286, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7774869109947644, |
|
"grad_norm": 1.7581151723861694, |
|
"learning_rate": 7.187527293471385e-05, |
|
"loss": 2.7098, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7801047120418848, |
|
"grad_norm": 1.8031219244003296, |
|
"learning_rate": 7.168024267962111e-05, |
|
"loss": 2.746, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7827225130890052, |
|
"grad_norm": 1.743693470954895, |
|
"learning_rate": 7.14848053375676e-05, |
|
"loss": 2.9825, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7853403141361257, |
|
"grad_norm": 1.8501001596450806, |
|
"learning_rate": 7.128896457825364e-05, |
|
"loss": 2.7572, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7879581151832461, |
|
"grad_norm": 1.8102787733078003, |
|
"learning_rate": 7.109272407895449e-05, |
|
"loss": 2.7863, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7905759162303665, |
|
"grad_norm": 1.8646475076675415, |
|
"learning_rate": 7.089608752445121e-05, |
|
"loss": 2.8223, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7931937172774869, |
|
"grad_norm": 1.9499090909957886, |
|
"learning_rate": 7.069905860696162e-05, |
|
"loss": 2.7525, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7958115183246073, |
|
"grad_norm": 2.4195919036865234, |
|
"learning_rate": 7.05016410260708e-05, |
|
"loss": 2.7437, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7984293193717278, |
|
"grad_norm": 2.0008833408355713, |
|
"learning_rate": 7.030383848866177e-05, |
|
"loss": 2.8387, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8010471204188482, |
|
"grad_norm": 2.05761456489563, |
|
"learning_rate": 7.010565470884582e-05, |
|
"loss": 2.8233, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.8036649214659686, |
|
"grad_norm": 1.608881950378418, |
|
"learning_rate": 6.990709340789273e-05, |
|
"loss": 2.8514, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.806282722513089, |
|
"grad_norm": 1.8444130420684814, |
|
"learning_rate": 6.970815831416099e-05, |
|
"loss": 2.784, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.8089005235602095, |
|
"grad_norm": 1.946343183517456, |
|
"learning_rate": 6.950885316302773e-05, |
|
"loss": 2.6383, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.8115183246073299, |
|
"grad_norm": 1.7848812341690063, |
|
"learning_rate": 6.93091816968186e-05, |
|
"loss": 2.7524, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8141361256544503, |
|
"grad_norm": 1.8199461698532104, |
|
"learning_rate": 6.910914766473749e-05, |
|
"loss": 2.7292, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.8167539267015707, |
|
"grad_norm": 1.8317044973373413, |
|
"learning_rate": 6.890875482279614e-05, |
|
"loss": 2.7381, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.819371727748691, |
|
"grad_norm": 1.7864313125610352, |
|
"learning_rate": 6.870800693374364e-05, |
|
"loss": 2.7642, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.8219895287958116, |
|
"grad_norm": 1.8122559785842896, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 2.6585, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.824607329842932, |
|
"grad_norm": 1.8325432538986206, |
|
"learning_rate": 6.830546109856401e-05, |
|
"loss": 2.7378, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8272251308900523, |
|
"grad_norm": 1.7728067636489868, |
|
"learning_rate": 6.810367071098516e-05, |
|
"loss": 2.8454, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8298429319371727, |
|
"grad_norm": 1.841179609298706, |
|
"learning_rate": 6.790154039324975e-05, |
|
"loss": 2.6204, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.8324607329842932, |
|
"grad_norm": 1.6985840797424316, |
|
"learning_rate": 6.769907394073117e-05, |
|
"loss": 2.905, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8350785340314136, |
|
"grad_norm": 1.8315932750701904, |
|
"learning_rate": 6.749627515511442e-05, |
|
"loss": 2.7492, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 1.8084025382995605, |
|
"learning_rate": 6.729314784432465e-05, |
|
"loss": 2.752, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8403141361256544, |
|
"grad_norm": 1.8621736764907837, |
|
"learning_rate": 6.708969582245568e-05, |
|
"loss": 2.6648, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8429319371727748, |
|
"grad_norm": 1.7286403179168701, |
|
"learning_rate": 6.688592290969837e-05, |
|
"loss": 2.5931, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8455497382198953, |
|
"grad_norm": 1.67673921585083, |
|
"learning_rate": 6.668183293226891e-05, |
|
"loss": 2.8364, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8481675392670157, |
|
"grad_norm": 1.9970136880874634, |
|
"learning_rate": 6.647742972233703e-05, |
|
"loss": 2.888, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8507853403141361, |
|
"grad_norm": 1.8752323389053345, |
|
"learning_rate": 6.627271711795386e-05, |
|
"loss": 3.0591, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8534031413612565, |
|
"grad_norm": 1.7339110374450684, |
|
"learning_rate": 6.606769896298014e-05, |
|
"loss": 2.8724, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.856020942408377, |
|
"grad_norm": 1.8849775791168213, |
|
"learning_rate": 6.586237910701374e-05, |
|
"loss": 2.8541, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8586387434554974, |
|
"grad_norm": 1.8441506624221802, |
|
"learning_rate": 6.565676140531764e-05, |
|
"loss": 2.9447, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8612565445026178, |
|
"grad_norm": 1.7712169885635376, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 2.6989, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8638743455497382, |
|
"grad_norm": 1.7494961023330688, |
|
"learning_rate": 6.524464791367861e-05, |
|
"loss": 2.7702, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8664921465968587, |
|
"grad_norm": 2.063343048095703, |
|
"learning_rate": 6.503815986193456e-05, |
|
"loss": 2.746, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8691099476439791, |
|
"grad_norm": 1.89371919631958, |
|
"learning_rate": 6.483138944071316e-05, |
|
"loss": 2.7176, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8717277486910995, |
|
"grad_norm": 2.864008665084839, |
|
"learning_rate": 6.462434053251446e-05, |
|
"loss": 2.6897, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8743455497382199, |
|
"grad_norm": 1.761522650718689, |
|
"learning_rate": 6.441701702506754e-05, |
|
"loss": 2.5764, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8769633507853403, |
|
"grad_norm": 1.726992130279541, |
|
"learning_rate": 6.420942281125765e-05, |
|
"loss": 2.6313, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8795811518324608, |
|
"grad_norm": 1.711841106414795, |
|
"learning_rate": 6.400156178905308e-05, |
|
"loss": 2.7067, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8821989528795812, |
|
"grad_norm": 1.8135813474655151, |
|
"learning_rate": 6.379343786143184e-05, |
|
"loss": 2.78, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8848167539267016, |
|
"grad_norm": 1.6920337677001953, |
|
"learning_rate": 6.358505493630858e-05, |
|
"loss": 2.8199, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.887434554973822, |
|
"grad_norm": 1.7479579448699951, |
|
"learning_rate": 6.337641692646106e-05, |
|
"loss": 2.8199, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8900523560209425, |
|
"grad_norm": 1.778996467590332, |
|
"learning_rate": 6.316752774945673e-05, |
|
"loss": 2.6521, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8926701570680629, |
|
"grad_norm": 1.964356541633606, |
|
"learning_rate": 6.295839132757919e-05, |
|
"loss": 2.7588, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8952879581151832, |
|
"grad_norm": 1.6494593620300293, |
|
"learning_rate": 6.274901158775454e-05, |
|
"loss": 2.7191, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8979057591623036, |
|
"grad_norm": 1.7057850360870361, |
|
"learning_rate": 6.25393924614776e-05, |
|
"loss": 2.5695, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.900523560209424, |
|
"grad_norm": 1.83281672000885, |
|
"learning_rate": 6.232953788473811e-05, |
|
"loss": 2.7329, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.9031413612565445, |
|
"grad_norm": 1.7066670656204224, |
|
"learning_rate": 6.211945179794684e-05, |
|
"loss": 2.6925, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9057591623036649, |
|
"grad_norm": 1.7664719820022583, |
|
"learning_rate": 6.190913814586162e-05, |
|
"loss": 2.8158, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.9083769633507853, |
|
"grad_norm": 1.8208638429641724, |
|
"learning_rate": 6.169860087751321e-05, |
|
"loss": 2.6722, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.9109947643979057, |
|
"grad_norm": 1.891035556793213, |
|
"learning_rate": 6.148784394613119e-05, |
|
"loss": 2.7744, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.9136125654450262, |
|
"grad_norm": 1.7416268587112427, |
|
"learning_rate": 6.127687130906972e-05, |
|
"loss": 2.5742, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.9162303664921466, |
|
"grad_norm": 2.1573400497436523, |
|
"learning_rate": 6.106568692773324e-05, |
|
"loss": 2.7032, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.918848167539267, |
|
"grad_norm": 1.9572385549545288, |
|
"learning_rate": 6.0854294767502084e-05, |
|
"loss": 2.7508, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.9214659685863874, |
|
"grad_norm": 1.7982288599014282, |
|
"learning_rate": 6.064269879765805e-05, |
|
"loss": 2.8771, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.9240837696335078, |
|
"grad_norm": 1.6815402507781982, |
|
"learning_rate": 6.043090299130978e-05, |
|
"loss": 2.7996, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.9267015706806283, |
|
"grad_norm": 1.8650509119033813, |
|
"learning_rate": 6.021891132531825e-05, |
|
"loss": 2.7655, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.9293193717277487, |
|
"grad_norm": 1.7183104753494263, |
|
"learning_rate": 6.000672778022208e-05, |
|
"loss": 2.6123, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9319371727748691, |
|
"grad_norm": 1.7947955131530762, |
|
"learning_rate": 5.979435634016277e-05, |
|
"loss": 2.6827, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9345549738219895, |
|
"grad_norm": 1.6206470727920532, |
|
"learning_rate": 5.95818009928099e-05, |
|
"loss": 2.5481, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.93717277486911, |
|
"grad_norm": 1.8061578273773193, |
|
"learning_rate": 5.9369065729286245e-05, |
|
"loss": 2.7013, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9397905759162304, |
|
"grad_norm": 1.7426854372024536, |
|
"learning_rate": 5.9156154544092815e-05, |
|
"loss": 2.8558, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.9424083769633508, |
|
"grad_norm": 1.7318534851074219, |
|
"learning_rate": 5.894307143503393e-05, |
|
"loss": 2.8222, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9450261780104712, |
|
"grad_norm": 1.6571924686431885, |
|
"learning_rate": 5.8729820403142054e-05, |
|
"loss": 2.7089, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9476439790575916, |
|
"grad_norm": 1.6273107528686523, |
|
"learning_rate": 5.851640545260276e-05, |
|
"loss": 2.5964, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9502617801047121, |
|
"grad_norm": 1.7294092178344727, |
|
"learning_rate": 5.830283059067947e-05, |
|
"loss": 2.6222, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9528795811518325, |
|
"grad_norm": 1.631488561630249, |
|
"learning_rate": 5.808909982763825e-05, |
|
"loss": 2.711, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9554973821989529, |
|
"grad_norm": 1.7980823516845703, |
|
"learning_rate": 5.787521717667247e-05, |
|
"loss": 2.5482, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9581151832460733, |
|
"grad_norm": 1.5978546142578125, |
|
"learning_rate": 5.7661186653827535e-05, |
|
"loss": 2.6563, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9607329842931938, |
|
"grad_norm": 1.7057075500488281, |
|
"learning_rate": 5.744701227792538e-05, |
|
"loss": 2.7849, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9633507853403142, |
|
"grad_norm": 1.9055167436599731, |
|
"learning_rate": 5.7232698070489065e-05, |
|
"loss": 2.8311, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9659685863874345, |
|
"grad_norm": 1.6838874816894531, |
|
"learning_rate": 5.701824805566722e-05, |
|
"loss": 2.6986, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9685863874345549, |
|
"grad_norm": 1.7164732217788696, |
|
"learning_rate": 5.680366626015855e-05, |
|
"loss": 2.7211, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9712041884816754, |
|
"grad_norm": 4.532098770141602, |
|
"learning_rate": 5.658895671313619e-05, |
|
"loss": 2.6758, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9738219895287958, |
|
"grad_norm": 1.914410948753357, |
|
"learning_rate": 5.6374123446172e-05, |
|
"loss": 2.8492, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9764397905759162, |
|
"grad_norm": 1.6964002847671509, |
|
"learning_rate": 5.615917049316095e-05, |
|
"loss": 2.6859, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9790575916230366, |
|
"grad_norm": 1.863756537437439, |
|
"learning_rate": 5.5944101890245324e-05, |
|
"loss": 2.6823, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.981675392670157, |
|
"grad_norm": 1.729614496231079, |
|
"learning_rate": 5.5728921675738964e-05, |
|
"loss": 2.6016, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9842931937172775, |
|
"grad_norm": 1.7206858396530151, |
|
"learning_rate": 5.551363389005144e-05, |
|
"loss": 2.6316, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9869109947643979, |
|
"grad_norm": 1.6375540494918823, |
|
"learning_rate": 5.529824257561212e-05, |
|
"loss": 2.5566, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9895287958115183, |
|
"grad_norm": 1.7328511476516724, |
|
"learning_rate": 5.508275177679436e-05, |
|
"loss": 2.7835, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9921465968586387, |
|
"grad_norm": 1.7834824323654175, |
|
"learning_rate": 5.486716553983951e-05, |
|
"loss": 2.6773, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9947643979057592, |
|
"grad_norm": 1.8649531602859497, |
|
"learning_rate": 5.4651487912780906e-05, |
|
"loss": 2.8563, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9973821989528796, |
|
"grad_norm": 1.7680840492248535, |
|
"learning_rate": 5.443572294536801e-05, |
|
"loss": 2.6923, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.223090648651123, |
|
"learning_rate": 5.4219874688990146e-05, |
|
"loss": 2.8072, |
|
"step": 382 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 764, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 382, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.459853308847718e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|