{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 382, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002617801047120419, "grad_norm": 4.880457401275635, "learning_rate": 2.564102564102564e-06, "loss": 6.8614, "step": 1 }, { "epoch": 0.005235602094240838, "grad_norm": 5.4761762619018555, "learning_rate": 5.128205128205128e-06, "loss": 7.3812, "step": 2 }, { "epoch": 0.007853403141361256, "grad_norm": 4.928334712982178, "learning_rate": 7.692307692307694e-06, "loss": 7.2286, "step": 3 }, { "epoch": 0.010471204188481676, "grad_norm": 5.0216498374938965, "learning_rate": 1.0256410256410256e-05, "loss": 7.3511, "step": 4 }, { "epoch": 0.013089005235602094, "grad_norm": 4.557470798492432, "learning_rate": 1.282051282051282e-05, "loss": 6.6464, "step": 5 }, { "epoch": 0.015706806282722512, "grad_norm": 4.309239864349365, "learning_rate": 1.5384615384615387e-05, "loss": 6.9958, "step": 6 }, { "epoch": 0.01832460732984293, "grad_norm": 4.001352787017822, "learning_rate": 1.794871794871795e-05, "loss": 6.5667, "step": 7 }, { "epoch": 0.020942408376963352, "grad_norm": 3.922077178955078, "learning_rate": 2.0512820512820512e-05, "loss": 6.491, "step": 8 }, { "epoch": 0.02356020942408377, "grad_norm": 3.411221742630005, "learning_rate": 2.307692307692308e-05, "loss": 6.0273, "step": 9 }, { "epoch": 0.02617801047120419, "grad_norm": 3.9540348052978516, "learning_rate": 2.564102564102564e-05, "loss": 6.6163, "step": 10 }, { "epoch": 0.028795811518324606, "grad_norm": 3.2421910762786865, "learning_rate": 2.8205128205128207e-05, "loss": 6.2189, "step": 11 }, { "epoch": 0.031413612565445025, "grad_norm": 3.0702974796295166, "learning_rate": 3.0769230769230774e-05, "loss": 5.5681, "step": 12 }, { "epoch": 0.034031413612565446, "grad_norm": 4.09296178817749, "learning_rate": 3.3333333333333335e-05, "loss": 6.302, "step": 13 }, { "epoch": 0.03664921465968586, "grad_norm": 2.9595351219177246, "learning_rate": 3.58974358974359e-05, "loss": 5.4689, "step": 14 }, { "epoch": 0.03926701570680628, "grad_norm": 3.2727208137512207, "learning_rate": 3.846153846153846e-05, "loss": 5.6809, "step": 15 }, { "epoch": 0.041884816753926704, "grad_norm": 3.616870880126953, "learning_rate": 4.1025641025641023e-05, "loss": 5.7648, "step": 16 }, { "epoch": 0.04450261780104712, "grad_norm": 3.6780197620391846, "learning_rate": 4.358974358974359e-05, "loss": 5.4097, "step": 17 }, { "epoch": 0.04712041884816754, "grad_norm": 3.512361526489258, "learning_rate": 4.615384615384616e-05, "loss": 5.7418, "step": 18 }, { "epoch": 0.049738219895287955, "grad_norm": 4.385097503662109, "learning_rate": 4.871794871794872e-05, "loss": 5.4156, "step": 19 }, { "epoch": 0.05235602094240838, "grad_norm": 4.452427864074707, "learning_rate": 5.128205128205128e-05, "loss": 5.3937, "step": 20 }, { "epoch": 0.0549738219895288, "grad_norm": 3.246995210647583, "learning_rate": 5.384615384615385e-05, "loss": 5.0434, "step": 21 }, { "epoch": 0.05759162303664921, "grad_norm": 3.3380537033081055, "learning_rate": 5.6410256410256414e-05, "loss": 4.999, "step": 22 }, { "epoch": 0.060209424083769635, "grad_norm": 3.313646078109741, "learning_rate": 5.897435897435898e-05, "loss": 5.1477, "step": 23 }, { "epoch": 0.06282722513089005, "grad_norm": 2.918846368789673, "learning_rate": 6.153846153846155e-05, "loss": 4.6583, "step": 24 }, { "epoch": 0.06544502617801047, "grad_norm": 3.070826768875122, "learning_rate": 6.410256410256412e-05, "loss": 4.5479, "step": 25 }, { "epoch": 0.06806282722513089, "grad_norm": 3.04087233543396, "learning_rate": 6.666666666666667e-05, "loss": 4.592, "step": 26 }, { "epoch": 0.07068062827225131, "grad_norm": 2.8428642749786377, "learning_rate": 6.923076923076924e-05, "loss": 4.3682, "step": 27 }, { "epoch": 0.07329842931937172, "grad_norm": 3.0716347694396973, "learning_rate": 7.17948717948718e-05, "loss": 4.5837, "step": 28 }, { "epoch": 0.07591623036649214, "grad_norm": 2.571244478225708, "learning_rate": 7.435897435897436e-05, "loss": 3.992, "step": 29 }, { "epoch": 0.07853403141361257, "grad_norm": 3.7117347717285156, "learning_rate": 7.692307692307693e-05, "loss": 4.5136, "step": 30 }, { "epoch": 0.08115183246073299, "grad_norm": 2.827247381210327, "learning_rate": 7.948717948717948e-05, "loss": 4.4797, "step": 31 }, { "epoch": 0.08376963350785341, "grad_norm": 2.7113707065582275, "learning_rate": 8.205128205128205e-05, "loss": 4.1275, "step": 32 }, { "epoch": 0.08638743455497382, "grad_norm": 2.837117910385132, "learning_rate": 8.461538461538461e-05, "loss": 4.3012, "step": 33 }, { "epoch": 0.08900523560209424, "grad_norm": 3.1402807235717773, "learning_rate": 8.717948717948718e-05, "loss": 4.3305, "step": 34 }, { "epoch": 0.09162303664921466, "grad_norm": 2.8632307052612305, "learning_rate": 8.974358974358975e-05, "loss": 4.0145, "step": 35 }, { "epoch": 0.09424083769633508, "grad_norm": 2.7232565879821777, "learning_rate": 9.230769230769232e-05, "loss": 4.2267, "step": 36 }, { "epoch": 0.0968586387434555, "grad_norm": 2.762054443359375, "learning_rate": 9.487179487179487e-05, "loss": 4.0502, "step": 37 }, { "epoch": 0.09947643979057591, "grad_norm": 18.91065788269043, "learning_rate": 9.743589743589744e-05, "loss": 4.1297, "step": 38 }, { "epoch": 0.10209424083769633, "grad_norm": 3.6653544902801514, "learning_rate": 0.0001, "loss": 4.2813, "step": 39 }, { "epoch": 0.10471204188481675, "grad_norm": 2.7435433864593506, "learning_rate": 9.999953057840867e-05, "loss": 3.7026, "step": 40 }, { "epoch": 0.10732984293193717, "grad_norm": 3.3734006881713867, "learning_rate": 9.999812232244895e-05, "loss": 4.1651, "step": 41 }, { "epoch": 0.1099476439790576, "grad_norm": 2.5766139030456543, "learning_rate": 9.999577525856345e-05, "loss": 4.0622, "step": 42 }, { "epoch": 0.112565445026178, "grad_norm": 3.926454544067383, "learning_rate": 9.99924894308227e-05, "loss": 4.2311, "step": 43 }, { "epoch": 0.11518324607329843, "grad_norm": 3.0207953453063965, "learning_rate": 9.998826490092421e-05, "loss": 3.7921, "step": 44 }, { "epoch": 0.11780104712041885, "grad_norm": 2.495757579803467, "learning_rate": 9.998310174819142e-05, "loss": 3.7601, "step": 45 }, { "epoch": 0.12041884816753927, "grad_norm": 2.8352251052856445, "learning_rate": 9.997700006957214e-05, "loss": 3.6366, "step": 46 }, { "epoch": 0.12303664921465969, "grad_norm": 2.970708131790161, "learning_rate": 9.996995997963675e-05, "loss": 3.9135, "step": 47 }, { "epoch": 0.1256544502617801, "grad_norm": 2.4032399654388428, "learning_rate": 9.996198161057607e-05, "loss": 3.8009, "step": 48 }, { "epoch": 0.12827225130890052, "grad_norm": 3.217522144317627, "learning_rate": 9.995306511219885e-05, "loss": 3.9169, "step": 49 }, { "epoch": 0.13089005235602094, "grad_norm": 2.8503968715667725, "learning_rate": 9.994321065192894e-05, "loss": 3.9316, "step": 50 }, { "epoch": 0.13350785340314136, "grad_norm": 2.826801061630249, "learning_rate": 9.993241841480223e-05, "loss": 3.9417, "step": 51 }, { "epoch": 0.13612565445026178, "grad_norm": 2.5175540447235107, "learning_rate": 9.992068860346306e-05, "loss": 3.5643, "step": 52 }, { "epoch": 0.1387434554973822, "grad_norm": 2.7539591789245605, "learning_rate": 9.990802143816051e-05, "loss": 3.5607, "step": 53 }, { "epoch": 0.14136125654450263, "grad_norm": 2.4108636379241943, "learning_rate": 9.989441715674422e-05, "loss": 3.4459, "step": 54 }, { "epoch": 0.14397905759162305, "grad_norm": 3.2774994373321533, "learning_rate": 9.987987601465991e-05, "loss": 3.9545, "step": 55 }, { "epoch": 0.14659685863874344, "grad_norm": 2.3104467391967773, "learning_rate": 9.986439828494465e-05, "loss": 3.6954, "step": 56 }, { "epoch": 0.14921465968586387, "grad_norm": 2.5438120365142822, "learning_rate": 9.984798425822163e-05, "loss": 3.5669, "step": 57 }, { "epoch": 0.1518324607329843, "grad_norm": 2.4583828449249268, "learning_rate": 9.98306342426948e-05, "loss": 3.8843, "step": 58 }, { "epoch": 0.1544502617801047, "grad_norm": 2.34213924407959, "learning_rate": 9.981234856414307e-05, "loss": 3.7361, "step": 59 }, { "epoch": 0.15706806282722513, "grad_norm": 2.6676721572875977, "learning_rate": 9.979312756591407e-05, "loss": 3.7573, "step": 60 }, { "epoch": 0.15968586387434555, "grad_norm": 2.4997060298919678, "learning_rate": 9.977297160891792e-05, "loss": 3.7876, "step": 61 }, { "epoch": 0.16230366492146597, "grad_norm": 2.709794044494629, "learning_rate": 9.975188107162026e-05, "loss": 3.614, "step": 62 }, { "epoch": 0.1649214659685864, "grad_norm": 2.355778932571411, "learning_rate": 9.972985635003522e-05, "loss": 3.4262, "step": 63 }, { "epoch": 0.16753926701570682, "grad_norm": 2.5319371223449707, "learning_rate": 9.970689785771798e-05, "loss": 3.5963, "step": 64 }, { "epoch": 0.17015706806282724, "grad_norm": 2.559286117553711, "learning_rate": 9.968300602575707e-05, "loss": 3.6722, "step": 65 }, { "epoch": 0.17277486910994763, "grad_norm": 2.4441914558410645, "learning_rate": 9.965818130276612e-05, "loss": 3.4972, "step": 66 }, { "epoch": 0.17539267015706805, "grad_norm": 2.2148141860961914, "learning_rate": 9.963242415487557e-05, "loss": 3.6571, "step": 67 }, { "epoch": 0.17801047120418848, "grad_norm": 2.3289103507995605, "learning_rate": 9.96057350657239e-05, "loss": 3.6085, "step": 68 }, { "epoch": 0.1806282722513089, "grad_norm": 2.6062214374542236, "learning_rate": 9.957811453644847e-05, "loss": 3.4849, "step": 69 }, { "epoch": 0.18324607329842932, "grad_norm": 2.347382068634033, "learning_rate": 9.954956308567622e-05, "loss": 3.3534, "step": 70 }, { "epoch": 0.18586387434554974, "grad_norm": 4.254333972930908, "learning_rate": 9.952008124951381e-05, "loss": 3.6956, "step": 71 }, { "epoch": 0.18848167539267016, "grad_norm": 2.4889872074127197, "learning_rate": 9.948966958153771e-05, "loss": 3.5671, "step": 72 }, { "epoch": 0.19109947643979058, "grad_norm": 34.48236083984375, "learning_rate": 9.945832865278363e-05, "loss": 3.5257, "step": 73 }, { "epoch": 0.193717277486911, "grad_norm": 2.4782514572143555, "learning_rate": 9.942605905173592e-05, "loss": 3.5874, "step": 74 }, { "epoch": 0.19633507853403143, "grad_norm": 2.386019706726074, "learning_rate": 9.939286138431647e-05, "loss": 3.5946, "step": 75 }, { "epoch": 0.19895287958115182, "grad_norm": 2.1345767974853516, "learning_rate": 9.935873627387336e-05, "loss": 3.3744, "step": 76 }, { "epoch": 0.20157068062827224, "grad_norm": 2.562124013900757, "learning_rate": 9.932368436116915e-05, "loss": 3.4642, "step": 77 }, { "epoch": 0.20418848167539266, "grad_norm": 2.3154282569885254, "learning_rate": 9.92877063043688e-05, "loss": 3.4095, "step": 78 }, { "epoch": 0.20680628272251309, "grad_norm": 2.225123643875122, "learning_rate": 9.925080277902743e-05, "loss": 3.496, "step": 79 }, { "epoch": 0.2094240837696335, "grad_norm": 2.0953874588012695, "learning_rate": 9.921297447807744e-05, "loss": 3.4574, "step": 80 }, { "epoch": 0.21204188481675393, "grad_norm": 2.2577614784240723, "learning_rate": 9.917422211181571e-05, "loss": 3.5562, "step": 81 }, { "epoch": 0.21465968586387435, "grad_norm": 2.2811625003814697, "learning_rate": 9.913454640789013e-05, "loss": 3.3894, "step": 82 }, { "epoch": 0.21727748691099477, "grad_norm": 2.139005661010742, "learning_rate": 9.909394811128598e-05, "loss": 3.3137, "step": 83 }, { "epoch": 0.2198952879581152, "grad_norm": 2.058781147003174, "learning_rate": 9.905242798431196e-05, "loss": 3.4472, "step": 84 }, { "epoch": 0.22251308900523561, "grad_norm": 14.535298347473145, "learning_rate": 9.900998680658581e-05, "loss": 3.3115, "step": 85 }, { "epoch": 0.225130890052356, "grad_norm": 2.935969114303589, "learning_rate": 9.896662537501976e-05, "loss": 3.3982, "step": 86 }, { "epoch": 0.22774869109947643, "grad_norm": 2.613450288772583, "learning_rate": 9.892234450380547e-05, "loss": 3.3868, "step": 87 }, { "epoch": 0.23036649214659685, "grad_norm": 2.750882387161255, "learning_rate": 9.887714502439884e-05, "loss": 3.2625, "step": 88 }, { "epoch": 0.23298429319371727, "grad_norm": 2.1589114665985107, "learning_rate": 9.883102778550434e-05, "loss": 3.1647, "step": 89 }, { "epoch": 0.2356020942408377, "grad_norm": 2.105313777923584, "learning_rate": 9.878399365305906e-05, "loss": 3.1404, "step": 90 }, { "epoch": 0.23821989528795812, "grad_norm": 3.4829330444335938, "learning_rate": 9.873604351021648e-05, "loss": 3.3671, "step": 91 }, { "epoch": 0.24083769633507854, "grad_norm": 3.3065099716186523, "learning_rate": 9.868717825732994e-05, "loss": 3.4199, "step": 92 }, { "epoch": 0.24345549738219896, "grad_norm": 5.105870723724365, "learning_rate": 9.863739881193558e-05, "loss": 3.4172, "step": 93 }, { "epoch": 0.24607329842931938, "grad_norm": 3.03316593170166, "learning_rate": 9.858670610873528e-05, "loss": 3.2284, "step": 94 }, { "epoch": 0.2486910994764398, "grad_norm": 2.4118738174438477, "learning_rate": 9.853510109957903e-05, "loss": 3.4009, "step": 95 }, { "epoch": 0.2513089005235602, "grad_norm": 5.970595359802246, "learning_rate": 9.848258475344702e-05, "loss": 3.387, "step": 96 }, { "epoch": 0.25392670157068065, "grad_norm": 2.43656325340271, "learning_rate": 9.842915805643155e-05, "loss": 3.3451, "step": 97 }, { "epoch": 0.25654450261780104, "grad_norm": 3.862488031387329, "learning_rate": 9.837482201171842e-05, "loss": 3.2579, "step": 98 }, { "epoch": 0.2591623036649215, "grad_norm": 2.120502471923828, "learning_rate": 9.831957763956813e-05, "loss": 3.3365, "step": 99 }, { "epoch": 0.2617801047120419, "grad_norm": 2.438363552093506, "learning_rate": 9.826342597729672e-05, "loss": 3.1251, "step": 100 }, { "epoch": 0.2643979057591623, "grad_norm": 2.0495247840881348, "learning_rate": 9.820636807925628e-05, "loss": 3.3333, "step": 101 }, { "epoch": 0.2670157068062827, "grad_norm": 2.2188355922698975, "learning_rate": 9.814840501681522e-05, "loss": 3.1956, "step": 102 }, { "epoch": 0.2696335078534031, "grad_norm": 2.230815887451172, "learning_rate": 9.808953787833801e-05, "loss": 3.1073, "step": 103 }, { "epoch": 0.27225130890052357, "grad_norm": 2.1100032329559326, "learning_rate": 9.802976776916494e-05, "loss": 3.184, "step": 104 }, { "epoch": 0.27486910994764396, "grad_norm": 2.1523215770721436, "learning_rate": 9.796909581159116e-05, "loss": 3.3009, "step": 105 }, { "epoch": 0.2774869109947644, "grad_norm": 2.2816920280456543, "learning_rate": 9.790752314484577e-05, "loss": 3.1125, "step": 106 }, { "epoch": 0.2801047120418848, "grad_norm": 2.134192705154419, "learning_rate": 9.784505092507031e-05, "loss": 3.0986, "step": 107 }, { "epoch": 0.28272251308900526, "grad_norm": 2.156048536300659, "learning_rate": 9.778168032529716e-05, "loss": 3.3802, "step": 108 }, { "epoch": 0.28534031413612565, "grad_norm": 2.2478604316711426, "learning_rate": 9.771741253542741e-05, "loss": 3.2685, "step": 109 }, { "epoch": 0.2879581151832461, "grad_norm": 2.222820997238159, "learning_rate": 9.765224876220859e-05, "loss": 3.1221, "step": 110 }, { "epoch": 0.2905759162303665, "grad_norm": 2.3353006839752197, "learning_rate": 9.758619022921202e-05, "loss": 3.1653, "step": 111 }, { "epoch": 0.2931937172774869, "grad_norm": 14.295672416687012, "learning_rate": 9.751923817680972e-05, "loss": 3.146, "step": 112 }, { "epoch": 0.29581151832460734, "grad_norm": 2.505723714828491, "learning_rate": 9.745139386215128e-05, "loss": 3.2138, "step": 113 }, { "epoch": 0.29842931937172773, "grad_norm": 2.1925199031829834, "learning_rate": 9.738265855914013e-05, "loss": 3.1778, "step": 114 }, { "epoch": 0.3010471204188482, "grad_norm": 2.1371121406555176, "learning_rate": 9.731303355840968e-05, "loss": 3.0539, "step": 115 }, { "epoch": 0.3036649214659686, "grad_norm": 2.133293628692627, "learning_rate": 9.724252016729909e-05, "loss": 3.0521, "step": 116 }, { "epoch": 0.306282722513089, "grad_norm": 2.2210733890533447, "learning_rate": 9.717111970982869e-05, "loss": 3.1743, "step": 117 }, { "epoch": 0.3089005235602094, "grad_norm": 2.2624800205230713, "learning_rate": 9.709883352667513e-05, "loss": 3.2274, "step": 118 }, { "epoch": 0.31151832460732987, "grad_norm": 2.1522083282470703, "learning_rate": 9.70256629751462e-05, "loss": 3.1238, "step": 119 }, { "epoch": 0.31413612565445026, "grad_norm": 1.954613447189331, "learning_rate": 9.69516094291554e-05, "loss": 3.0268, "step": 120 }, { "epoch": 0.31675392670157065, "grad_norm": 1.9362231492996216, "learning_rate": 9.687667427919605e-05, "loss": 3.148, "step": 121 }, { "epoch": 0.3193717277486911, "grad_norm": 2.171438455581665, "learning_rate": 9.680085893231521e-05, "loss": 3.069, "step": 122 }, { "epoch": 0.3219895287958115, "grad_norm": 2.280571699142456, "learning_rate": 9.672416481208738e-05, "loss": 3.3274, "step": 123 }, { "epoch": 0.32460732984293195, "grad_norm": 2.0850417613983154, "learning_rate": 9.664659335858755e-05, "loss": 3.2012, "step": 124 }, { "epoch": 0.32722513089005234, "grad_norm": 2.408496141433716, "learning_rate": 9.656814602836434e-05, "loss": 3.2839, "step": 125 }, { "epoch": 0.3298429319371728, "grad_norm": 120.96231842041016, "learning_rate": 9.648882429441257e-05, "loss": 3.1548, "step": 126 }, { "epoch": 0.3324607329842932, "grad_norm": 2.1636528968811035, "learning_rate": 9.640862964614564e-05, "loss": 3.0179, "step": 127 }, { "epoch": 0.33507853403141363, "grad_norm": 2.2096540927886963, "learning_rate": 9.632756358936749e-05, "loss": 3.261, "step": 128 }, { "epoch": 0.337696335078534, "grad_norm": 2.2390010356903076, "learning_rate": 9.624562764624445e-05, "loss": 3.1954, "step": 129 }, { "epoch": 0.3403141361256545, "grad_norm": 1.8933037519454956, "learning_rate": 9.616282335527653e-05, "loss": 3.2055, "step": 130 }, { "epoch": 0.34293193717277487, "grad_norm": 6.112705230712891, "learning_rate": 9.607915227126862e-05, "loss": 3.2929, "step": 131 }, { "epoch": 0.34554973821989526, "grad_norm": 4.289401054382324, "learning_rate": 9.599461596530127e-05, "loss": 3.1634, "step": 132 }, { "epoch": 0.3481675392670157, "grad_norm": 2.0408754348754883, "learning_rate": 9.590921602470116e-05, "loss": 3.1964, "step": 133 }, { "epoch": 0.3507853403141361, "grad_norm": 2.1104061603546143, "learning_rate": 9.582295405301131e-05, "loss": 3.0235, "step": 134 }, { "epoch": 0.35340314136125656, "grad_norm": 1.9149036407470703, "learning_rate": 9.573583166996103e-05, "loss": 3.2854, "step": 135 }, { "epoch": 0.35602094240837695, "grad_norm": 2.161102056503296, "learning_rate": 9.564785051143541e-05, "loss": 3.1318, "step": 136 }, { "epoch": 0.3586387434554974, "grad_norm": 2.186587333679199, "learning_rate": 9.555901222944468e-05, "loss": 3.2125, "step": 137 }, { "epoch": 0.3612565445026178, "grad_norm": 1.8297839164733887, "learning_rate": 9.546931849209314e-05, "loss": 2.96, "step": 138 }, { "epoch": 0.36387434554973824, "grad_norm": 2.2810025215148926, "learning_rate": 9.537877098354786e-05, "loss": 3.0554, "step": 139 }, { "epoch": 0.36649214659685864, "grad_norm": 2.006052017211914, "learning_rate": 9.528737140400707e-05, "loss": 3.1757, "step": 140 }, { "epoch": 0.36910994764397903, "grad_norm": 2.3248322010040283, "learning_rate": 9.519512146966823e-05, "loss": 3.2321, "step": 141 }, { "epoch": 0.3717277486910995, "grad_norm": 2.1551225185394287, "learning_rate": 9.510202291269576e-05, "loss": 3.0333, "step": 142 }, { "epoch": 0.3743455497382199, "grad_norm": 2.613978147506714, "learning_rate": 9.500807748118856e-05, "loss": 3.3177, "step": 143 }, { "epoch": 0.3769633507853403, "grad_norm": 2.302820920944214, "learning_rate": 9.491328693914722e-05, "loss": 3.069, "step": 144 }, { "epoch": 0.3795811518324607, "grad_norm": 2.0801970958709717, "learning_rate": 9.48176530664408e-05, "loss": 3.1207, "step": 145 }, { "epoch": 0.38219895287958117, "grad_norm": 1.8670393228530884, "learning_rate": 9.472117765877349e-05, "loss": 2.988, "step": 146 }, { "epoch": 0.38481675392670156, "grad_norm": 2.123073101043701, "learning_rate": 9.462386252765087e-05, "loss": 3.1316, "step": 147 }, { "epoch": 0.387434554973822, "grad_norm": 2.0116608142852783, "learning_rate": 9.452570950034589e-05, "loss": 3.1323, "step": 148 }, { "epoch": 0.3900523560209424, "grad_norm": 1.8206120729446411, "learning_rate": 9.442672041986457e-05, "loss": 2.8792, "step": 149 }, { "epoch": 0.39267015706806285, "grad_norm": 1.9137699604034424, "learning_rate": 9.432689714491136e-05, "loss": 3.0191, "step": 150 }, { "epoch": 0.39528795811518325, "grad_norm": 1.9897340536117554, "learning_rate": 9.422624154985427e-05, "loss": 3.1578, "step": 151 }, { "epoch": 0.39790575916230364, "grad_norm": 1.8997538089752197, "learning_rate": 9.412475552468974e-05, "loss": 3.0389, "step": 152 }, { "epoch": 0.4005235602094241, "grad_norm": 2.249044895172119, "learning_rate": 9.402244097500696e-05, "loss": 3.3441, "step": 153 }, { "epoch": 0.4031413612565445, "grad_norm": 1.9846928119659424, "learning_rate": 9.391929982195232e-05, "loss": 3.2146, "step": 154 }, { "epoch": 0.40575916230366493, "grad_norm": 2.240514039993286, "learning_rate": 9.381533400219318e-05, "loss": 3.2069, "step": 155 }, { "epoch": 0.4083769633507853, "grad_norm": 2.115003824234009, "learning_rate": 9.371054546788157e-05, "loss": 2.9426, "step": 156 }, { "epoch": 0.4109947643979058, "grad_norm": 1.9737358093261719, "learning_rate": 9.36049361866175e-05, "loss": 3.1503, "step": 157 }, { "epoch": 0.41361256544502617, "grad_norm": 2.096264600753784, "learning_rate": 9.349850814141204e-05, "loss": 3.0736, "step": 158 }, { "epoch": 0.4162303664921466, "grad_norm": 2.3380467891693115, "learning_rate": 9.339126333065007e-05, "loss": 3.2136, "step": 159 }, { "epoch": 0.418848167539267, "grad_norm": 2.1407546997070312, "learning_rate": 9.328320376805281e-05, "loss": 3.1564, "step": 160 }, { "epoch": 0.4214659685863874, "grad_norm": 1.9080549478530884, "learning_rate": 9.317433148263995e-05, "loss": 2.9173, "step": 161 }, { "epoch": 0.42408376963350786, "grad_norm": 4.025334358215332, "learning_rate": 9.30646485186915e-05, "loss": 2.8801, "step": 162 }, { "epoch": 0.42670157068062825, "grad_norm": 2.0670013427734375, "learning_rate": 9.295415693570955e-05, "loss": 2.9837, "step": 163 }, { "epoch": 0.4293193717277487, "grad_norm": 2.1820228099823, "learning_rate": 9.284285880837946e-05, "loss": 3.1169, "step": 164 }, { "epoch": 0.4319371727748691, "grad_norm": 1.9107846021652222, "learning_rate": 9.273075622653102e-05, "loss": 3.1854, "step": 165 }, { "epoch": 0.43455497382198954, "grad_norm": 2.0229499340057373, "learning_rate": 9.261785129509914e-05, "loss": 2.9287, "step": 166 }, { "epoch": 0.43717277486910994, "grad_norm": 1.9189234972000122, "learning_rate": 9.250414613408427e-05, "loss": 3.0001, "step": 167 }, { "epoch": 0.4397905759162304, "grad_norm": 2.0910720825195312, "learning_rate": 9.238964287851275e-05, "loss": 2.9357, "step": 168 }, { "epoch": 0.4424083769633508, "grad_norm": 1.9052855968475342, "learning_rate": 9.22743436783966e-05, "loss": 2.9641, "step": 169 }, { "epoch": 0.44502617801047123, "grad_norm": 1.9111841917037964, "learning_rate": 9.215825069869316e-05, "loss": 3.0879, "step": 170 }, { "epoch": 0.4476439790575916, "grad_norm": 1.898743987083435, "learning_rate": 9.20413661192645e-05, "loss": 2.9687, "step": 171 }, { "epoch": 0.450261780104712, "grad_norm": 1.9681291580200195, "learning_rate": 9.192369213483642e-05, "loss": 3.0313, "step": 172 }, { "epoch": 0.45287958115183247, "grad_norm": 1.937829613685608, "learning_rate": 9.180523095495727e-05, "loss": 2.9251, "step": 173 }, { "epoch": 0.45549738219895286, "grad_norm": 2.15289568901062, "learning_rate": 9.168598480395651e-05, "loss": 3.0487, "step": 174 }, { "epoch": 0.4581151832460733, "grad_norm": 2.0502867698669434, "learning_rate": 9.156595592090284e-05, "loss": 2.8817, "step": 175 }, { "epoch": 0.4607329842931937, "grad_norm": 2.0720651149749756, "learning_rate": 9.14451465595622e-05, "loss": 3.009, "step": 176 }, { "epoch": 0.46335078534031415, "grad_norm": 2.0610387325286865, "learning_rate": 9.132355898835556e-05, "loss": 3.0353, "step": 177 }, { "epoch": 0.46596858638743455, "grad_norm": 2.160541534423828, "learning_rate": 9.12011954903161e-05, "loss": 3.1546, "step": 178 }, { "epoch": 0.468586387434555, "grad_norm": 2.1311235427856445, "learning_rate": 9.107805836304658e-05, "loss": 3.0889, "step": 179 }, { "epoch": 0.4712041884816754, "grad_norm": 1.902626633644104, "learning_rate": 9.095414991867604e-05, "loss": 2.9551, "step": 180 }, { "epoch": 0.4738219895287958, "grad_norm": 1.9568802118301392, "learning_rate": 9.082947248381643e-05, "loss": 2.9683, "step": 181 }, { "epoch": 0.47643979057591623, "grad_norm": 2.024705171585083, "learning_rate": 9.070402839951897e-05, "loss": 3.0207, "step": 182 }, { "epoch": 0.4790575916230366, "grad_norm": 1.8830217123031616, "learning_rate": 9.057782002123012e-05, "loss": 2.8518, "step": 183 }, { "epoch": 0.4816753926701571, "grad_norm": 2.031515598297119, "learning_rate": 9.045084971874738e-05, "loss": 2.9262, "step": 184 }, { "epoch": 0.48429319371727747, "grad_norm": 1.9783765077590942, "learning_rate": 9.03231198761748e-05, "loss": 2.9476, "step": 185 }, { "epoch": 0.4869109947643979, "grad_norm": 1.9431626796722412, "learning_rate": 9.019463289187827e-05, "loss": 2.8431, "step": 186 }, { "epoch": 0.4895287958115183, "grad_norm": 1.8779281377792358, "learning_rate": 9.00653911784403e-05, "loss": 3.0361, "step": 187 }, { "epoch": 0.49214659685863876, "grad_norm": 1.810121774673462, "learning_rate": 8.993539716261498e-05, "loss": 2.847, "step": 188 }, { "epoch": 0.49476439790575916, "grad_norm": 2.5064663887023926, "learning_rate": 8.980465328528219e-05, "loss": 2.9118, "step": 189 }, { "epoch": 0.4973821989528796, "grad_norm": 1.9613292217254639, "learning_rate": 8.96731620014019e-05, "loss": 2.8882, "step": 190 }, { "epoch": 0.5, "grad_norm": 2.058799982070923, "learning_rate": 8.954092577996803e-05, "loss": 3.0066, "step": 191 }, { "epoch": 0.5026178010471204, "grad_norm": 6.452920436859131, "learning_rate": 8.940794710396205e-05, "loss": 3.005, "step": 192 }, { "epoch": 0.5052356020942408, "grad_norm": 2.1949808597564697, "learning_rate": 8.927422847030646e-05, "loss": 2.9602, "step": 193 }, { "epoch": 0.5078534031413613, "grad_norm": 2.0062756538391113, "learning_rate": 8.913977238981778e-05, "loss": 2.9961, "step": 194 }, { "epoch": 0.5104712041884817, "grad_norm": 1.8738442659378052, "learning_rate": 8.900458138715954e-05, "loss": 3.0411, "step": 195 }, { "epoch": 0.5130890052356021, "grad_norm": 1.8305824995040894, "learning_rate": 8.886865800079474e-05, "loss": 3.0892, "step": 196 }, { "epoch": 0.5157068062827225, "grad_norm": 1.8090300559997559, "learning_rate": 8.873200478293826e-05, "loss": 2.7575, "step": 197 }, { "epoch": 0.518324607329843, "grad_norm": 1.870323657989502, "learning_rate": 8.859462429950897e-05, "loss": 3.054, "step": 198 }, { "epoch": 0.5209424083769634, "grad_norm": 1.965692162513733, "learning_rate": 8.845651913008145e-05, "loss": 2.8143, "step": 199 }, { "epoch": 0.5235602094240838, "grad_norm": 1.88938570022583, "learning_rate": 8.831769186783765e-05, "loss": 2.9259, "step": 200 }, { "epoch": 0.5261780104712042, "grad_norm": 1.8629285097122192, "learning_rate": 8.817814511951814e-05, "loss": 2.8782, "step": 201 }, { "epoch": 0.5287958115183246, "grad_norm": 1.873532772064209, "learning_rate": 8.80378815053732e-05, "loss": 2.8025, "step": 202 }, { "epoch": 0.5314136125654451, "grad_norm": 2.1665189266204834, "learning_rate": 8.789690365911356e-05, "loss": 3.0567, "step": 203 }, { "epoch": 0.5340314136125655, "grad_norm": 1.980928659439087, "learning_rate": 8.775521422786104e-05, "loss": 2.9795, "step": 204 }, { "epoch": 0.5366492146596858, "grad_norm": 2.0913095474243164, "learning_rate": 8.761281587209876e-05, "loss": 3.0988, "step": 205 }, { "epoch": 0.5392670157068062, "grad_norm": 1.8205846548080444, "learning_rate": 8.746971126562124e-05, "loss": 2.9828, "step": 206 }, { "epoch": 0.5418848167539267, "grad_norm": 2.1041507720947266, "learning_rate": 8.732590309548416e-05, "loss": 3.0541, "step": 207 }, { "epoch": 0.5445026178010471, "grad_norm": 2.023577928543091, "learning_rate": 8.718139406195393e-05, "loss": 3.0061, "step": 208 }, { "epoch": 0.5471204188481675, "grad_norm": 1.8006081581115723, "learning_rate": 8.703618687845696e-05, "loss": 2.789, "step": 209 }, { "epoch": 0.5497382198952879, "grad_norm": 1.840972900390625, "learning_rate": 8.689028427152874e-05, "loss": 2.9437, "step": 210 }, { "epoch": 0.5523560209424084, "grad_norm": 1.9612823724746704, "learning_rate": 8.674368898076261e-05, "loss": 3.1407, "step": 211 }, { "epoch": 0.5549738219895288, "grad_norm": 2.0491254329681396, "learning_rate": 8.65964037587584e-05, "loss": 2.9557, "step": 212 }, { "epoch": 0.5575916230366492, "grad_norm": 2.0597126483917236, "learning_rate": 8.644843137107059e-05, "loss": 2.9996, "step": 213 }, { "epoch": 0.5602094240837696, "grad_norm": 1.7746310234069824, "learning_rate": 8.629977459615655e-05, "loss": 3.0459, "step": 214 }, { "epoch": 0.56282722513089, "grad_norm": 1.892563819885254, "learning_rate": 8.615043622532429e-05, "loss": 2.9831, "step": 215 }, { "epoch": 0.5654450261780105, "grad_norm": 1.7289716005325317, "learning_rate": 8.600041906268e-05, "loss": 2.7416, "step": 216 }, { "epoch": 0.5680628272251309, "grad_norm": 1.929553508758545, "learning_rate": 8.584972592507553e-05, "loss": 2.9513, "step": 217 }, { "epoch": 0.5706806282722513, "grad_norm": 1.957163691520691, "learning_rate": 8.569835964205536e-05, "loss": 2.7647, "step": 218 }, { "epoch": 0.5732984293193717, "grad_norm": 1.8348171710968018, "learning_rate": 8.554632305580354e-05, "loss": 2.9897, "step": 219 }, { "epoch": 0.5759162303664922, "grad_norm": 1.811667561531067, "learning_rate": 8.539361902109033e-05, "loss": 2.8322, "step": 220 }, { "epoch": 0.5785340314136126, "grad_norm": 2.0632617473602295, "learning_rate": 8.524025040521856e-05, "loss": 2.9788, "step": 221 }, { "epoch": 0.581151832460733, "grad_norm": 1.847805142402649, "learning_rate": 8.508622008796985e-05, "loss": 2.8168, "step": 222 }, { "epoch": 0.5837696335078534, "grad_norm": 1.7859346866607666, "learning_rate": 8.493153096155042e-05, "loss": 2.819, "step": 223 }, { "epoch": 0.5863874345549738, "grad_norm": 1.756142020225525, "learning_rate": 8.477618593053693e-05, "loss": 2.9083, "step": 224 }, { "epoch": 0.5890052356020943, "grad_norm": 1.7891409397125244, "learning_rate": 8.462018791182184e-05, "loss": 2.911, "step": 225 }, { "epoch": 0.5916230366492147, "grad_norm": 1.923041582107544, "learning_rate": 8.44635398345587e-05, "loss": 2.7662, "step": 226 }, { "epoch": 0.5942408376963351, "grad_norm": 2.131024122238159, "learning_rate": 8.430624464010706e-05, "loss": 2.8831, "step": 227 }, { "epoch": 0.5968586387434555, "grad_norm": 1.9196547269821167, "learning_rate": 8.414830528197737e-05, "loss": 2.875, "step": 228 }, { "epoch": 0.599476439790576, "grad_norm": 1.9374908208847046, "learning_rate": 8.39897247257754e-05, "loss": 3.0421, "step": 229 }, { "epoch": 0.6020942408376964, "grad_norm": 1.9606311321258545, "learning_rate": 8.383050594914665e-05, "loss": 2.9347, "step": 230 }, { "epoch": 0.6047120418848168, "grad_norm": 1.8933106660842896, "learning_rate": 8.367065194172037e-05, "loss": 2.9623, "step": 231 }, { "epoch": 0.6073298429319371, "grad_norm": 2.1029906272888184, "learning_rate": 8.351016570505347e-05, "loss": 2.8704, "step": 232 }, { "epoch": 0.6099476439790575, "grad_norm": 1.86416494846344, "learning_rate": 8.334905025257413e-05, "loss": 3.0935, "step": 233 }, { "epoch": 0.612565445026178, "grad_norm": 1.8532062768936157, "learning_rate": 8.318730860952522e-05, "loss": 2.7635, "step": 234 }, { "epoch": 0.6151832460732984, "grad_norm": 2.0105836391448975, "learning_rate": 8.302494381290756e-05, "loss": 3.0954, "step": 235 }, { "epoch": 0.6178010471204188, "grad_norm": 2.164985418319702, "learning_rate": 8.286195891142274e-05, "loss": 2.9424, "step": 236 }, { "epoch": 0.6204188481675392, "grad_norm": 1.9459847211837769, "learning_rate": 8.269835696541607e-05, "loss": 2.8438, "step": 237 }, { "epoch": 0.6230366492146597, "grad_norm": 3.4676437377929688, "learning_rate": 8.253414104681898e-05, "loss": 3.0494, "step": 238 }, { "epoch": 0.6256544502617801, "grad_norm": 1.8192616701126099, "learning_rate": 8.236931423909138e-05, "loss": 2.7965, "step": 239 }, { "epoch": 0.6282722513089005, "grad_norm": 1.8246678113937378, "learning_rate": 8.220387963716377e-05, "loss": 2.8788, "step": 240 }, { "epoch": 0.6308900523560209, "grad_norm": 1.8622642755508423, "learning_rate": 8.20378403473791e-05, "loss": 3.0934, "step": 241 }, { "epoch": 0.6335078534031413, "grad_norm": 1.927139163017273, "learning_rate": 8.18711994874345e-05, "loss": 2.9704, "step": 242 }, { "epoch": 0.6361256544502618, "grad_norm": 1.7952053546905518, "learning_rate": 8.170396018632264e-05, "loss": 2.8313, "step": 243 }, { "epoch": 0.6387434554973822, "grad_norm": 1.8302192687988281, "learning_rate": 8.153612558427311e-05, "loss": 2.8198, "step": 244 }, { "epoch": 0.6413612565445026, "grad_norm": 1.809410810470581, "learning_rate": 8.13676988326933e-05, "loss": 2.6532, "step": 245 }, { "epoch": 0.643979057591623, "grad_norm": 1.8222423791885376, "learning_rate": 8.119868309410943e-05, "loss": 2.9867, "step": 246 }, { "epoch": 0.6465968586387435, "grad_norm": 7.688499927520752, "learning_rate": 8.102908154210693e-05, "loss": 2.8391, "step": 247 }, { "epoch": 0.6492146596858639, "grad_norm": 1.8286641836166382, "learning_rate": 8.085889736127103e-05, "loss": 2.9778, "step": 248 }, { "epoch": 0.6518324607329843, "grad_norm": 1.816272258758545, "learning_rate": 8.068813374712688e-05, "loss": 2.8969, "step": 249 }, { "epoch": 0.6544502617801047, "grad_norm": 2.080170154571533, "learning_rate": 8.05167939060796e-05, "loss": 2.9861, "step": 250 }, { "epoch": 0.6570680628272252, "grad_norm": 1.8223365545272827, "learning_rate": 8.0344881055354e-05, "loss": 2.9545, "step": 251 }, { "epoch": 0.6596858638743456, "grad_norm": 1.891790509223938, "learning_rate": 8.017239842293427e-05, "loss": 2.8586, "step": 252 }, { "epoch": 0.662303664921466, "grad_norm": 1.8906011581420898, "learning_rate": 7.999934924750325e-05, "loss": 2.94, "step": 253 }, { "epoch": 0.6649214659685864, "grad_norm": 1.8247802257537842, "learning_rate": 7.982573677838172e-05, "loss": 2.6747, "step": 254 }, { "epoch": 0.6675392670157068, "grad_norm": 1.8509819507598877, "learning_rate": 7.965156427546735e-05, "loss": 2.8795, "step": 255 }, { "epoch": 0.6701570680628273, "grad_norm": 1.796658992767334, "learning_rate": 7.947683500917347e-05, "loss": 2.8549, "step": 256 }, { "epoch": 0.6727748691099477, "grad_norm": 1.771183729171753, "learning_rate": 7.93015522603677e-05, "loss": 2.9275, "step": 257 }, { "epoch": 0.675392670157068, "grad_norm": 1.8382376432418823, "learning_rate": 7.91257193203103e-05, "loss": 3.0237, "step": 258 }, { "epoch": 0.6780104712041884, "grad_norm": 1.8723816871643066, "learning_rate": 7.894933949059245e-05, "loss": 2.6623, "step": 259 }, { "epoch": 0.680628272251309, "grad_norm": 1.9213804006576538, "learning_rate": 7.877241608307411e-05, "loss": 2.8994, "step": 260 }, { "epoch": 0.6832460732984293, "grad_norm": 1.7478219270706177, "learning_rate": 7.8594952419822e-05, "loss": 2.7949, "step": 261 }, { "epoch": 0.6858638743455497, "grad_norm": 1.8188650608062744, "learning_rate": 7.841695183304713e-05, "loss": 2.8178, "step": 262 }, { "epoch": 0.6884816753926701, "grad_norm": 1.8901054859161377, "learning_rate": 7.823841766504227e-05, "loss": 2.9314, "step": 263 }, { "epoch": 0.6910994764397905, "grad_norm": 1.9486525058746338, "learning_rate": 7.805935326811912e-05, "loss": 2.7373, "step": 264 }, { "epoch": 0.693717277486911, "grad_norm": 1.7464019060134888, "learning_rate": 7.787976200454546e-05, "loss": 2.8273, "step": 265 }, { "epoch": 0.6963350785340314, "grad_norm": 1.8585435152053833, "learning_rate": 7.769964724648196e-05, "loss": 2.8128, "step": 266 }, { "epoch": 0.6989528795811518, "grad_norm": 1.7956280708312988, "learning_rate": 7.751901237591887e-05, "loss": 2.7562, "step": 267 }, { "epoch": 0.7015706806282722, "grad_norm": 2.0384879112243652, "learning_rate": 7.733786078461252e-05, "loss": 2.9731, "step": 268 }, { "epoch": 0.7041884816753927, "grad_norm": 1.7548547983169556, "learning_rate": 7.715619587402164e-05, "loss": 2.834, "step": 269 }, { "epoch": 0.7068062827225131, "grad_norm": 1.651876449584961, "learning_rate": 7.697402105524351e-05, "loss": 2.7768, "step": 270 }, { "epoch": 0.7094240837696335, "grad_norm": 1.8058605194091797, "learning_rate": 7.679133974894983e-05, "loss": 2.7503, "step": 271 }, { "epoch": 0.7120418848167539, "grad_norm": 1.921723484992981, "learning_rate": 7.66081553853226e-05, "loss": 2.7922, "step": 272 }, { "epoch": 0.7146596858638743, "grad_norm": 1.990212321281433, "learning_rate": 7.642447140398965e-05, "loss": 2.8968, "step": 273 }, { "epoch": 0.7172774869109948, "grad_norm": 1.8311606645584106, "learning_rate": 7.624029125396004e-05, "loss": 2.9963, "step": 274 }, { "epoch": 0.7198952879581152, "grad_norm": 2.0072896480560303, "learning_rate": 7.605561839355933e-05, "loss": 2.875, "step": 275 }, { "epoch": 0.7225130890052356, "grad_norm": 1.9953948259353638, "learning_rate": 7.587045629036463e-05, "loss": 2.9325, "step": 276 }, { "epoch": 0.725130890052356, "grad_norm": 1.8839069604873657, "learning_rate": 7.568480842113952e-05, "loss": 2.682, "step": 277 }, { "epoch": 0.7277486910994765, "grad_norm": 1.6751598119735718, "learning_rate": 7.549867827176873e-05, "loss": 2.7488, "step": 278 }, { "epoch": 0.7303664921465969, "grad_norm": 1.7167940139770508, "learning_rate": 7.53120693371927e-05, "loss": 2.9112, "step": 279 }, { "epoch": 0.7329842931937173, "grad_norm": 1.9378173351287842, "learning_rate": 7.512498512134194e-05, "loss": 2.9304, "step": 280 }, { "epoch": 0.7356020942408377, "grad_norm": 1.7746607065200806, "learning_rate": 7.493742913707127e-05, "loss": 2.9898, "step": 281 }, { "epoch": 0.7382198952879581, "grad_norm": 1.9426668882369995, "learning_rate": 7.474940490609383e-05, "loss": 2.9828, "step": 282 }, { "epoch": 0.7408376963350786, "grad_norm": 1.8619779348373413, "learning_rate": 7.456091595891498e-05, "loss": 2.7965, "step": 283 }, { "epoch": 0.743455497382199, "grad_norm": 1.691998839378357, "learning_rate": 7.437196583476596e-05, "loss": 2.8217, "step": 284 }, { "epoch": 0.7460732984293194, "grad_norm": 1.8032022714614868, "learning_rate": 7.41825580815375e-05, "loss": 2.7456, "step": 285 }, { "epoch": 0.7486910994764397, "grad_norm": 1.8445478677749634, "learning_rate": 7.399269625571316e-05, "loss": 2.855, "step": 286 }, { "epoch": 0.7513089005235603, "grad_norm": 1.69001305103302, "learning_rate": 7.380238392230257e-05, "loss": 2.6786, "step": 287 }, { "epoch": 0.7539267015706806, "grad_norm": 1.8098856210708618, "learning_rate": 7.361162465477442e-05, "loss": 2.8392, "step": 288 }, { "epoch": 0.756544502617801, "grad_norm": 2.023061752319336, "learning_rate": 7.342042203498951e-05, "loss": 3.1742, "step": 289 }, { "epoch": 0.7591623036649214, "grad_norm": 2.027721405029297, "learning_rate": 7.322877965313335e-05, "loss": 2.707, "step": 290 }, { "epoch": 0.7617801047120419, "grad_norm": 2.092510938644409, "learning_rate": 7.303670110764881e-05, "loss": 2.9764, "step": 291 }, { "epoch": 0.7643979057591623, "grad_norm": 1.794023871421814, "learning_rate": 7.284419000516855e-05, "loss": 2.8471, "step": 292 }, { "epoch": 0.7670157068062827, "grad_norm": 1.998582124710083, "learning_rate": 7.26512499604473e-05, "loss": 2.8464, "step": 293 }, { "epoch": 0.7696335078534031, "grad_norm": 1.9297953844070435, "learning_rate": 7.245788459629396e-05, "loss": 2.6567, "step": 294 }, { "epoch": 0.7722513089005235, "grad_norm": 1.889047622680664, "learning_rate": 7.226409754350361e-05, "loss": 2.8468, "step": 295 }, { "epoch": 0.774869109947644, "grad_norm": 1.806944489479065, "learning_rate": 7.206989244078934e-05, "loss": 2.7286, "step": 296 }, { "epoch": 0.7774869109947644, "grad_norm": 1.7581151723861694, "learning_rate": 7.187527293471385e-05, "loss": 2.7098, "step": 297 }, { "epoch": 0.7801047120418848, "grad_norm": 1.8031219244003296, "learning_rate": 7.168024267962111e-05, "loss": 2.746, "step": 298 }, { "epoch": 0.7827225130890052, "grad_norm": 1.743693470954895, "learning_rate": 7.14848053375676e-05, "loss": 2.9825, "step": 299 }, { "epoch": 0.7853403141361257, "grad_norm": 1.8501001596450806, "learning_rate": 7.128896457825364e-05, "loss": 2.7572, "step": 300 }, { "epoch": 0.7879581151832461, "grad_norm": 1.8102787733078003, "learning_rate": 7.109272407895449e-05, "loss": 2.7863, "step": 301 }, { "epoch": 0.7905759162303665, "grad_norm": 1.8646475076675415, "learning_rate": 7.089608752445121e-05, "loss": 2.8223, "step": 302 }, { "epoch": 0.7931937172774869, "grad_norm": 1.9499090909957886, "learning_rate": 7.069905860696162e-05, "loss": 2.7525, "step": 303 }, { "epoch": 0.7958115183246073, "grad_norm": 2.4195919036865234, "learning_rate": 7.05016410260708e-05, "loss": 2.7437, "step": 304 }, { "epoch": 0.7984293193717278, "grad_norm": 2.0008833408355713, "learning_rate": 7.030383848866177e-05, "loss": 2.8387, "step": 305 }, { "epoch": 0.8010471204188482, "grad_norm": 2.05761456489563, "learning_rate": 7.010565470884582e-05, "loss": 2.8233, "step": 306 }, { "epoch": 0.8036649214659686, "grad_norm": 1.608881950378418, "learning_rate": 6.990709340789273e-05, "loss": 2.8514, "step": 307 }, { "epoch": 0.806282722513089, "grad_norm": 1.8444130420684814, "learning_rate": 6.970815831416099e-05, "loss": 2.784, "step": 308 }, { "epoch": 0.8089005235602095, "grad_norm": 1.946343183517456, "learning_rate": 6.950885316302773e-05, "loss": 2.6383, "step": 309 }, { "epoch": 0.8115183246073299, "grad_norm": 1.7848812341690063, "learning_rate": 6.93091816968186e-05, "loss": 2.7524, "step": 310 }, { "epoch": 0.8141361256544503, "grad_norm": 1.8199461698532104, "learning_rate": 6.910914766473749e-05, "loss": 2.7292, "step": 311 }, { "epoch": 0.8167539267015707, "grad_norm": 1.8317044973373413, "learning_rate": 6.890875482279614e-05, "loss": 2.7381, "step": 312 }, { "epoch": 0.819371727748691, "grad_norm": 1.7864313125610352, "learning_rate": 6.870800693374364e-05, "loss": 2.7642, "step": 313 }, { "epoch": 0.8219895287958116, "grad_norm": 1.8122559785842896, "learning_rate": 6.850690776699573e-05, "loss": 2.6585, "step": 314 }, { "epoch": 0.824607329842932, "grad_norm": 1.8325432538986206, "learning_rate": 6.830546109856401e-05, "loss": 2.7378, "step": 315 }, { "epoch": 0.8272251308900523, "grad_norm": 1.7728067636489868, "learning_rate": 6.810367071098516e-05, "loss": 2.8454, "step": 316 }, { "epoch": 0.8298429319371727, "grad_norm": 1.841179609298706, "learning_rate": 6.790154039324975e-05, "loss": 2.6204, "step": 317 }, { "epoch": 0.8324607329842932, "grad_norm": 1.6985840797424316, "learning_rate": 6.769907394073117e-05, "loss": 2.905, "step": 318 }, { "epoch": 0.8350785340314136, "grad_norm": 1.8315932750701904, "learning_rate": 6.749627515511442e-05, "loss": 2.7492, "step": 319 }, { "epoch": 0.837696335078534, "grad_norm": 1.8084025382995605, "learning_rate": 6.729314784432465e-05, "loss": 2.752, "step": 320 }, { "epoch": 0.8403141361256544, "grad_norm": 1.8621736764907837, "learning_rate": 6.708969582245568e-05, "loss": 2.6648, "step": 321 }, { "epoch": 0.8429319371727748, "grad_norm": 1.7286403179168701, "learning_rate": 6.688592290969837e-05, "loss": 2.5931, "step": 322 }, { "epoch": 0.8455497382198953, "grad_norm": 1.67673921585083, "learning_rate": 6.668183293226891e-05, "loss": 2.8364, "step": 323 }, { "epoch": 0.8481675392670157, "grad_norm": 1.9970136880874634, "learning_rate": 6.647742972233703e-05, "loss": 2.888, "step": 324 }, { "epoch": 0.8507853403141361, "grad_norm": 1.8752323389053345, "learning_rate": 6.627271711795386e-05, "loss": 3.0591, "step": 325 }, { "epoch": 0.8534031413612565, "grad_norm": 1.7339110374450684, "learning_rate": 6.606769896298014e-05, "loss": 2.8724, "step": 326 }, { "epoch": 0.856020942408377, "grad_norm": 1.8849775791168213, "learning_rate": 6.586237910701374e-05, "loss": 2.8541, "step": 327 }, { "epoch": 0.8586387434554974, "grad_norm": 1.8441506624221802, "learning_rate": 6.565676140531764e-05, "loss": 2.9447, "step": 328 }, { "epoch": 0.8612565445026178, "grad_norm": 1.7712169885635376, "learning_rate": 6.545084971874738e-05, "loss": 2.6989, "step": 329 }, { "epoch": 0.8638743455497382, "grad_norm": 1.7494961023330688, "learning_rate": 6.524464791367861e-05, "loss": 2.7702, "step": 330 }, { "epoch": 0.8664921465968587, "grad_norm": 2.063343048095703, "learning_rate": 6.503815986193456e-05, "loss": 2.746, "step": 331 }, { "epoch": 0.8691099476439791, "grad_norm": 1.89371919631958, "learning_rate": 6.483138944071316e-05, "loss": 2.7176, "step": 332 }, { "epoch": 0.8717277486910995, "grad_norm": 2.864008665084839, "learning_rate": 6.462434053251446e-05, "loss": 2.6897, "step": 333 }, { "epoch": 0.8743455497382199, "grad_norm": 1.761522650718689, "learning_rate": 6.441701702506754e-05, "loss": 2.5764, "step": 334 }, { "epoch": 0.8769633507853403, "grad_norm": 1.726992130279541, "learning_rate": 6.420942281125765e-05, "loss": 2.6313, "step": 335 }, { "epoch": 0.8795811518324608, "grad_norm": 1.711841106414795, "learning_rate": 6.400156178905308e-05, "loss": 2.7067, "step": 336 }, { "epoch": 0.8821989528795812, "grad_norm": 1.8135813474655151, "learning_rate": 6.379343786143184e-05, "loss": 2.78, "step": 337 }, { "epoch": 0.8848167539267016, "grad_norm": 1.6920337677001953, "learning_rate": 6.358505493630858e-05, "loss": 2.8199, "step": 338 }, { "epoch": 0.887434554973822, "grad_norm": 1.7479579448699951, "learning_rate": 6.337641692646106e-05, "loss": 2.8199, "step": 339 }, { "epoch": 0.8900523560209425, "grad_norm": 1.778996467590332, "learning_rate": 6.316752774945673e-05, "loss": 2.6521, "step": 340 }, { "epoch": 0.8926701570680629, "grad_norm": 1.964356541633606, "learning_rate": 6.295839132757919e-05, "loss": 2.7588, "step": 341 }, { "epoch": 0.8952879581151832, "grad_norm": 1.6494593620300293, "learning_rate": 6.274901158775454e-05, "loss": 2.7191, "step": 342 }, { "epoch": 0.8979057591623036, "grad_norm": 1.7057850360870361, "learning_rate": 6.25393924614776e-05, "loss": 2.5695, "step": 343 }, { "epoch": 0.900523560209424, "grad_norm": 1.83281672000885, "learning_rate": 6.232953788473811e-05, "loss": 2.7329, "step": 344 }, { "epoch": 0.9031413612565445, "grad_norm": 1.7066670656204224, "learning_rate": 6.211945179794684e-05, "loss": 2.6925, "step": 345 }, { "epoch": 0.9057591623036649, "grad_norm": 1.7664719820022583, "learning_rate": 6.190913814586162e-05, "loss": 2.8158, "step": 346 }, { "epoch": 0.9083769633507853, "grad_norm": 1.8208638429641724, "learning_rate": 6.169860087751321e-05, "loss": 2.6722, "step": 347 }, { "epoch": 0.9109947643979057, "grad_norm": 1.891035556793213, "learning_rate": 6.148784394613119e-05, "loss": 2.7744, "step": 348 }, { "epoch": 0.9136125654450262, "grad_norm": 1.7416268587112427, "learning_rate": 6.127687130906972e-05, "loss": 2.5742, "step": 349 }, { "epoch": 0.9162303664921466, "grad_norm": 2.1573400497436523, "learning_rate": 6.106568692773324e-05, "loss": 2.7032, "step": 350 }, { "epoch": 0.918848167539267, "grad_norm": 1.9572385549545288, "learning_rate": 6.0854294767502084e-05, "loss": 2.7508, "step": 351 }, { "epoch": 0.9214659685863874, "grad_norm": 1.7982288599014282, "learning_rate": 6.064269879765805e-05, "loss": 2.8771, "step": 352 }, { "epoch": 0.9240837696335078, "grad_norm": 1.6815402507781982, "learning_rate": 6.043090299130978e-05, "loss": 2.7996, "step": 353 }, { "epoch": 0.9267015706806283, "grad_norm": 1.8650509119033813, "learning_rate": 6.021891132531825e-05, "loss": 2.7655, "step": 354 }, { "epoch": 0.9293193717277487, "grad_norm": 1.7183104753494263, "learning_rate": 6.000672778022208e-05, "loss": 2.6123, "step": 355 }, { "epoch": 0.9319371727748691, "grad_norm": 1.7947955131530762, "learning_rate": 5.979435634016277e-05, "loss": 2.6827, "step": 356 }, { "epoch": 0.9345549738219895, "grad_norm": 1.6206470727920532, "learning_rate": 5.95818009928099e-05, "loss": 2.5481, "step": 357 }, { "epoch": 0.93717277486911, "grad_norm": 1.8061578273773193, "learning_rate": 5.9369065729286245e-05, "loss": 2.7013, "step": 358 }, { "epoch": 0.9397905759162304, "grad_norm": 1.7426854372024536, "learning_rate": 5.9156154544092815e-05, "loss": 2.8558, "step": 359 }, { "epoch": 0.9424083769633508, "grad_norm": 1.7318534851074219, "learning_rate": 5.894307143503393e-05, "loss": 2.8222, "step": 360 }, { "epoch": 0.9450261780104712, "grad_norm": 1.6571924686431885, "learning_rate": 5.8729820403142054e-05, "loss": 2.7089, "step": 361 }, { "epoch": 0.9476439790575916, "grad_norm": 1.6273107528686523, "learning_rate": 5.851640545260276e-05, "loss": 2.5964, "step": 362 }, { "epoch": 0.9502617801047121, "grad_norm": 1.7294092178344727, "learning_rate": 5.830283059067947e-05, "loss": 2.6222, "step": 363 }, { "epoch": 0.9528795811518325, "grad_norm": 1.631488561630249, "learning_rate": 5.808909982763825e-05, "loss": 2.711, "step": 364 }, { "epoch": 0.9554973821989529, "grad_norm": 1.7980823516845703, "learning_rate": 5.787521717667247e-05, "loss": 2.5482, "step": 365 }, { "epoch": 0.9581151832460733, "grad_norm": 1.5978546142578125, "learning_rate": 5.7661186653827535e-05, "loss": 2.6563, "step": 366 }, { "epoch": 0.9607329842931938, "grad_norm": 1.7057075500488281, "learning_rate": 5.744701227792538e-05, "loss": 2.7849, "step": 367 }, { "epoch": 0.9633507853403142, "grad_norm": 1.9055167436599731, "learning_rate": 5.7232698070489065e-05, "loss": 2.8311, "step": 368 }, { "epoch": 0.9659685863874345, "grad_norm": 1.6838874816894531, "learning_rate": 5.701824805566722e-05, "loss": 2.6986, "step": 369 }, { "epoch": 0.9685863874345549, "grad_norm": 1.7164732217788696, "learning_rate": 5.680366626015855e-05, "loss": 2.7211, "step": 370 }, { "epoch": 0.9712041884816754, "grad_norm": 4.532098770141602, "learning_rate": 5.658895671313619e-05, "loss": 2.6758, "step": 371 }, { "epoch": 0.9738219895287958, "grad_norm": 1.914410948753357, "learning_rate": 5.6374123446172e-05, "loss": 2.8492, "step": 372 }, { "epoch": 0.9764397905759162, "grad_norm": 1.6964002847671509, "learning_rate": 5.615917049316095e-05, "loss": 2.6859, "step": 373 }, { "epoch": 0.9790575916230366, "grad_norm": 1.863756537437439, "learning_rate": 5.5944101890245324e-05, "loss": 2.6823, "step": 374 }, { "epoch": 0.981675392670157, "grad_norm": 1.729614496231079, "learning_rate": 5.5728921675738964e-05, "loss": 2.6016, "step": 375 }, { "epoch": 0.9842931937172775, "grad_norm": 1.7206858396530151, "learning_rate": 5.551363389005144e-05, "loss": 2.6316, "step": 376 }, { "epoch": 0.9869109947643979, "grad_norm": 1.6375540494918823, "learning_rate": 5.529824257561212e-05, "loss": 2.5566, "step": 377 }, { "epoch": 0.9895287958115183, "grad_norm": 1.7328511476516724, "learning_rate": 5.508275177679436e-05, "loss": 2.7835, "step": 378 }, { "epoch": 0.9921465968586387, "grad_norm": 1.7834824323654175, "learning_rate": 5.486716553983951e-05, "loss": 2.6773, "step": 379 }, { "epoch": 0.9947643979057592, "grad_norm": 1.8649531602859497, "learning_rate": 5.4651487912780906e-05, "loss": 2.8563, "step": 380 }, { "epoch": 0.9973821989528796, "grad_norm": 1.7680840492248535, "learning_rate": 5.443572294536801e-05, "loss": 2.6923, "step": 381 }, { "epoch": 1.0, "grad_norm": 2.223090648651123, "learning_rate": 5.4219874688990146e-05, "loss": 2.8072, "step": 382 } ], "logging_steps": 1, "max_steps": 764, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 382, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.459853308847718e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }