{ "best_metric": 0.8386966586112976, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.10933442667760011, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005466721333880005, "grad_norm": 0.4661848545074463, "learning_rate": 2.333333333333333e-06, "loss": 0.7648, "step": 1 }, { "epoch": 0.0005466721333880005, "eval_loss": 1.3695563077926636, "eval_runtime": 94.695, "eval_samples_per_second": 32.536, "eval_steps_per_second": 8.142, "step": 1 }, { "epoch": 0.001093344266776001, "grad_norm": 0.411937952041626, "learning_rate": 4.666666666666666e-06, "loss": 0.7517, "step": 2 }, { "epoch": 0.0016400164001640015, "grad_norm": 0.539547324180603, "learning_rate": 7e-06, "loss": 0.9216, "step": 3 }, { "epoch": 0.002186688533552002, "grad_norm": 0.5318194627761841, "learning_rate": 9.333333333333333e-06, "loss": 0.8582, "step": 4 }, { "epoch": 0.0027333606669400026, "grad_norm": 0.5687560439109802, "learning_rate": 1.1666666666666665e-05, "loss": 0.9711, "step": 5 }, { "epoch": 0.003280032800328003, "grad_norm": 0.7335965633392334, "learning_rate": 1.4e-05, "loss": 0.9818, "step": 6 }, { "epoch": 0.003826704933716004, "grad_norm": 0.4608137309551239, "learning_rate": 1.633333333333333e-05, "loss": 0.8496, "step": 7 }, { "epoch": 0.004373377067104004, "grad_norm": 0.5464603900909424, "learning_rate": 1.8666666666666665e-05, "loss": 0.9922, "step": 8 }, { "epoch": 0.004920049200492005, "grad_norm": 0.557895839214325, "learning_rate": 2.1e-05, "loss": 1.0499, "step": 9 }, { "epoch": 0.005466721333880005, "grad_norm": 0.43142303824424744, "learning_rate": 2.333333333333333e-05, "loss": 0.9916, "step": 10 }, { "epoch": 0.006013393467268006, "grad_norm": 0.39260706305503845, "learning_rate": 2.5666666666666663e-05, "loss": 0.9236, "step": 11 }, { "epoch": 0.006560065600656006, "grad_norm": 0.40968865156173706, "learning_rate": 2.8e-05, "loss": 1.0978, "step": 12 }, { "epoch": 0.007106737734044007, "grad_norm": 0.4441259503364563, "learning_rate": 3.0333333333333333e-05, "loss": 1.0146, "step": 13 }, { "epoch": 0.007653409867432008, "grad_norm": 0.40066277980804443, "learning_rate": 3.266666666666666e-05, "loss": 0.916, "step": 14 }, { "epoch": 0.008200082000820008, "grad_norm": 0.4672339856624603, "learning_rate": 3.5e-05, "loss": 1.0468, "step": 15 }, { "epoch": 0.008746754134208008, "grad_norm": 0.48708584904670715, "learning_rate": 3.733333333333333e-05, "loss": 1.049, "step": 16 }, { "epoch": 0.00929342626759601, "grad_norm": 0.5237523913383484, "learning_rate": 3.9666666666666664e-05, "loss": 1.0419, "step": 17 }, { "epoch": 0.00984009840098401, "grad_norm": 0.5612183213233948, "learning_rate": 4.2e-05, "loss": 0.9977, "step": 18 }, { "epoch": 0.01038677053437201, "grad_norm": 0.5430253148078918, "learning_rate": 4.4333333333333324e-05, "loss": 1.146, "step": 19 }, { "epoch": 0.01093344266776001, "grad_norm": 0.5050835013389587, "learning_rate": 4.666666666666666e-05, "loss": 0.9679, "step": 20 }, { "epoch": 0.011480114801148012, "grad_norm": 0.4491889774799347, "learning_rate": 4.899999999999999e-05, "loss": 0.8918, "step": 21 }, { "epoch": 0.012026786934536012, "grad_norm": 0.46133682131767273, "learning_rate": 5.1333333333333325e-05, "loss": 0.891, "step": 22 }, { "epoch": 0.012573459067924012, "grad_norm": 0.5220205783843994, "learning_rate": 5.3666666666666666e-05, "loss": 0.9138, "step": 23 }, { "epoch": 0.013120131201312012, "grad_norm": 0.5471235513687134, "learning_rate": 5.6e-05, "loss": 1.0219, "step": 24 }, { "epoch": 0.013666803334700014, "grad_norm": 0.5401260852813721, "learning_rate": 5.833333333333333e-05, "loss": 0.9114, "step": 25 }, { "epoch": 0.014213475468088014, "grad_norm": 0.5504897236824036, "learning_rate": 6.0666666666666666e-05, "loss": 0.9869, "step": 26 }, { "epoch": 0.014760147601476014, "grad_norm": 0.6574556827545166, "learning_rate": 6.3e-05, "loss": 0.9166, "step": 27 }, { "epoch": 0.015306819734864016, "grad_norm": 0.6381729245185852, "learning_rate": 6.533333333333333e-05, "loss": 1.0102, "step": 28 }, { "epoch": 0.015853491868252016, "grad_norm": 0.6967756748199463, "learning_rate": 6.766666666666667e-05, "loss": 0.9655, "step": 29 }, { "epoch": 0.016400164001640016, "grad_norm": 0.7127764225006104, "learning_rate": 7e-05, "loss": 1.0242, "step": 30 }, { "epoch": 0.016946836135028016, "grad_norm": 0.6833203434944153, "learning_rate": 6.999402376603183e-05, "loss": 0.9315, "step": 31 }, { "epoch": 0.017493508268416016, "grad_norm": 0.8914316296577454, "learning_rate": 6.99760971050058e-05, "loss": 1.0732, "step": 32 }, { "epoch": 0.018040180401804017, "grad_norm": 1.0326182842254639, "learning_rate": 6.994622613886018e-05, "loss": 1.0459, "step": 33 }, { "epoch": 0.01858685253519202, "grad_norm": 0.8010824918746948, "learning_rate": 6.990442106850258e-05, "loss": 0.986, "step": 34 }, { "epoch": 0.01913352466858002, "grad_norm": 0.9589248895645142, "learning_rate": 6.98506961703262e-05, "loss": 0.9221, "step": 35 }, { "epoch": 0.01968019680196802, "grad_norm": 0.7887236475944519, "learning_rate": 6.978506979133457e-05, "loss": 1.0374, "step": 36 }, { "epoch": 0.02022686893535602, "grad_norm": 0.8019972443580627, "learning_rate": 6.9707564342876e-05, "loss": 0.8858, "step": 37 }, { "epoch": 0.02077354106874402, "grad_norm": 0.8885377049446106, "learning_rate": 6.96182062929901e-05, "loss": 1.0831, "step": 38 }, { "epoch": 0.02132021320213202, "grad_norm": 0.9791058301925659, "learning_rate": 6.951702615736908e-05, "loss": 1.0472, "step": 39 }, { "epoch": 0.02186688533552002, "grad_norm": 1.0434457063674927, "learning_rate": 6.940405848893656e-05, "loss": 0.9667, "step": 40 }, { "epoch": 0.022413557468908024, "grad_norm": 1.0951871871948242, "learning_rate": 6.92793418660478e-05, "loss": 1.1452, "step": 41 }, { "epoch": 0.022960229602296024, "grad_norm": 1.0019595623016357, "learning_rate": 6.914291887931528e-05, "loss": 0.8607, "step": 42 }, { "epoch": 0.023506901735684024, "grad_norm": 1.2730623483657837, "learning_rate": 6.899483611706398e-05, "loss": 1.0172, "step": 43 }, { "epoch": 0.024053573869072024, "grad_norm": 1.064156413078308, "learning_rate": 6.883514414942155e-05, "loss": 0.9434, "step": 44 }, { "epoch": 0.024600246002460024, "grad_norm": 1.1558951139450073, "learning_rate": 6.866389751104867e-05, "loss": 0.9083, "step": 45 }, { "epoch": 0.025146918135848025, "grad_norm": 1.3744535446166992, "learning_rate": 6.848115468251542e-05, "loss": 0.8842, "step": 46 }, { "epoch": 0.025693590269236025, "grad_norm": 1.5875253677368164, "learning_rate": 6.828697807033038e-05, "loss": 1.0323, "step": 47 }, { "epoch": 0.026240262402624025, "grad_norm": 1.4661047458648682, "learning_rate": 6.808143398562868e-05, "loss": 0.8986, "step": 48 }, { "epoch": 0.026786934536012028, "grad_norm": 1.6527341604232788, "learning_rate": 6.786459262152698e-05, "loss": 1.0423, "step": 49 }, { "epoch": 0.02733360666940003, "grad_norm": 4.2761406898498535, "learning_rate": 6.763652802915244e-05, "loss": 1.1307, "step": 50 }, { "epoch": 0.02733360666940003, "eval_loss": 0.9474524855613708, "eval_runtime": 94.8645, "eval_samples_per_second": 32.478, "eval_steps_per_second": 8.127, "step": 50 }, { "epoch": 0.02788027880278803, "grad_norm": 0.9167900681495667, "learning_rate": 6.739731809235446e-05, "loss": 0.7292, "step": 51 }, { "epoch": 0.02842695093617603, "grad_norm": 1.0346499681472778, "learning_rate": 6.71470445011073e-05, "loss": 0.7878, "step": 52 }, { "epoch": 0.02897362306956403, "grad_norm": 0.9062778949737549, "learning_rate": 6.688579272361309e-05, "loss": 0.8643, "step": 53 }, { "epoch": 0.02952029520295203, "grad_norm": 0.6097351908683777, "learning_rate": 6.66136519771145e-05, "loss": 0.8222, "step": 54 }, { "epoch": 0.03006696733634003, "grad_norm": 0.5191702246665955, "learning_rate": 6.633071519742718e-05, "loss": 0.9669, "step": 55 }, { "epoch": 0.030613639469728032, "grad_norm": 0.37056273221969604, "learning_rate": 6.603707900720217e-05, "loss": 0.8109, "step": 56 }, { "epoch": 0.031160311603116032, "grad_norm": 0.3645178973674774, "learning_rate": 6.573284368292943e-05, "loss": 0.921, "step": 57 }, { "epoch": 0.03170698373650403, "grad_norm": 0.35407713055610657, "learning_rate": 6.541811312069348e-05, "loss": 0.8393, "step": 58 }, { "epoch": 0.03225365586989203, "grad_norm": 0.30585721135139465, "learning_rate": 6.509299480069303e-05, "loss": 0.8196, "step": 59 }, { "epoch": 0.03280032800328003, "grad_norm": 0.31569743156433105, "learning_rate": 6.47575997505365e-05, "loss": 0.9089, "step": 60 }, { "epoch": 0.03334700013666803, "grad_norm": 0.31442388892173767, "learning_rate": 6.441204250732624e-05, "loss": 0.7743, "step": 61 }, { "epoch": 0.03389367227005603, "grad_norm": 0.3461879789829254, "learning_rate": 6.405644107854427e-05, "loss": 0.8485, "step": 62 }, { "epoch": 0.03444034440344403, "grad_norm": 0.346314400434494, "learning_rate": 6.369091690175273e-05, "loss": 0.7661, "step": 63 }, { "epoch": 0.03498701653683203, "grad_norm": 0.3185812532901764, "learning_rate": 6.331559480312315e-05, "loss": 0.8583, "step": 64 }, { "epoch": 0.03553368867022003, "grad_norm": 0.3503943979740143, "learning_rate": 6.293060295480838e-05, "loss": 0.8725, "step": 65 }, { "epoch": 0.03608036080360803, "grad_norm": 0.3459616005420685, "learning_rate": 6.25360728311719e-05, "loss": 0.8382, "step": 66 }, { "epoch": 0.03662703293699603, "grad_norm": 0.38812822103500366, "learning_rate": 6.213213916388954e-05, "loss": 0.9174, "step": 67 }, { "epoch": 0.03717370507038404, "grad_norm": 0.37094104290008545, "learning_rate": 6.171893989593859e-05, "loss": 0.881, "step": 68 }, { "epoch": 0.03772037720377204, "grad_norm": 0.5786832571029663, "learning_rate": 6.129661613449057e-05, "loss": 0.9016, "step": 69 }, { "epoch": 0.03826704933716004, "grad_norm": 0.6645287871360779, "learning_rate": 6.086531210272306e-05, "loss": 0.965, "step": 70 }, { "epoch": 0.03881372147054804, "grad_norm": 0.43764594197273254, "learning_rate": 6.042517509056784e-05, "loss": 0.9049, "step": 71 }, { "epoch": 0.03936039360393604, "grad_norm": 0.34731465578079224, "learning_rate": 5.997635540441133e-05, "loss": 0.8948, "step": 72 }, { "epoch": 0.03990706573732404, "grad_norm": 0.40091952681541443, "learning_rate": 5.9519006315765176e-05, "loss": 0.8426, "step": 73 }, { "epoch": 0.04045373787071204, "grad_norm": 0.3977827727794647, "learning_rate": 5.9053284008924185e-05, "loss": 0.8352, "step": 74 }, { "epoch": 0.04100041000410004, "grad_norm": 0.4238167107105255, "learning_rate": 5.85793475276295e-05, "loss": 0.8615, "step": 75 }, { "epoch": 0.04154708213748804, "grad_norm": 0.46092405915260315, "learning_rate": 5.809735872075529e-05, "loss": 0.8255, "step": 76 }, { "epoch": 0.04209375427087604, "grad_norm": 0.4420395791530609, "learning_rate": 5.760748218703755e-05, "loss": 0.7647, "step": 77 }, { "epoch": 0.04264042640426404, "grad_norm": 0.4960534870624542, "learning_rate": 5.710988521886378e-05, "loss": 0.909, "step": 78 }, { "epoch": 0.04318709853765204, "grad_norm": 0.49868643283843994, "learning_rate": 5.660473774514275e-05, "loss": 0.7568, "step": 79 }, { "epoch": 0.04373377067104004, "grad_norm": 0.5315876603126526, "learning_rate": 5.6092212273273975e-05, "loss": 0.8924, "step": 80 }, { "epoch": 0.04428044280442804, "grad_norm": 0.5740758776664734, "learning_rate": 5.557248383023655e-05, "loss": 0.8379, "step": 81 }, { "epoch": 0.04482711493781605, "grad_norm": 0.613091766834259, "learning_rate": 5.5045729902817676e-05, "loss": 0.9659, "step": 82 }, { "epoch": 0.04537378707120405, "grad_norm": 0.6900278329849243, "learning_rate": 5.4512130377000987e-05, "loss": 0.9506, "step": 83 }, { "epoch": 0.04592045920459205, "grad_norm": 0.6675617694854736, "learning_rate": 5.397186747653573e-05, "loss": 0.9986, "step": 84 }, { "epoch": 0.04646713133798005, "grad_norm": 0.6233535408973694, "learning_rate": 5.342512570070745e-05, "loss": 0.9033, "step": 85 }, { "epoch": 0.04701380347136805, "grad_norm": 0.659621000289917, "learning_rate": 5.287209176133174e-05, "loss": 0.9068, "step": 86 }, { "epoch": 0.04756047560475605, "grad_norm": 0.759048342704773, "learning_rate": 5.231295451899226e-05, "loss": 0.9384, "step": 87 }, { "epoch": 0.04810714773814405, "grad_norm": 0.8321371078491211, "learning_rate": 5.174790491854502e-05, "loss": 1.0077, "step": 88 }, { "epoch": 0.04865381987153205, "grad_norm": 0.7181084156036377, "learning_rate": 5.117713592391096e-05, "loss": 1.0386, "step": 89 }, { "epoch": 0.04920049200492005, "grad_norm": 0.8390389084815979, "learning_rate": 5.060084245217884e-05, "loss": 1.0526, "step": 90 }, { "epoch": 0.04974716413830805, "grad_norm": 0.7568452954292297, "learning_rate": 5.0019221307041306e-05, "loss": 0.9571, "step": 91 }, { "epoch": 0.05029383627169605, "grad_norm": 1.033366322517395, "learning_rate": 4.943247111158662e-05, "loss": 0.9084, "step": 92 }, { "epoch": 0.05084050840508405, "grad_norm": 0.9008201360702515, "learning_rate": 4.884079224046898e-05, "loss": 0.8388, "step": 93 }, { "epoch": 0.05138718053847205, "grad_norm": 0.8567910194396973, "learning_rate": 4.824438675148086e-05, "loss": 0.9807, "step": 94 }, { "epoch": 0.05193385267186005, "grad_norm": 1.0424281358718872, "learning_rate": 4.764345831655036e-05, "loss": 1.0143, "step": 95 }, { "epoch": 0.05248052480524805, "grad_norm": 1.1355541944503784, "learning_rate": 4.703821215218748e-05, "loss": 0.9176, "step": 96 }, { "epoch": 0.053027196938636056, "grad_norm": 1.2170919179916382, "learning_rate": 4.642885494940291e-05, "loss": 0.8303, "step": 97 }, { "epoch": 0.053573869072024057, "grad_norm": 1.626427173614502, "learning_rate": 4.581559480312316e-05, "loss": 0.8297, "step": 98 }, { "epoch": 0.05412054120541206, "grad_norm": 1.3740547895431519, "learning_rate": 4.519864114112636e-05, "loss": 0.7661, "step": 99 }, { "epoch": 0.05466721333880006, "grad_norm": 2.6349620819091797, "learning_rate": 4.45782046525229e-05, "loss": 1.0357, "step": 100 }, { "epoch": 0.05466721333880006, "eval_loss": 0.8693193197250366, "eval_runtime": 94.6807, "eval_samples_per_second": 32.541, "eval_steps_per_second": 8.143, "step": 100 }, { "epoch": 0.05521388547218806, "grad_norm": 0.44926929473876953, "learning_rate": 4.3954497215805244e-05, "loss": 0.63, "step": 101 }, { "epoch": 0.05576055760557606, "grad_norm": 0.34577685594558716, "learning_rate": 4.332773182649165e-05, "loss": 0.6255, "step": 102 }, { "epoch": 0.05630722973896406, "grad_norm": 0.37551775574684143, "learning_rate": 4.2698122524388405e-05, "loss": 0.8668, "step": 103 }, { "epoch": 0.05685390187235206, "grad_norm": 0.3831069767475128, "learning_rate": 4.206588432049535e-05, "loss": 0.7327, "step": 104 }, { "epoch": 0.05740057400574006, "grad_norm": 0.368376761674881, "learning_rate": 4.143123312357996e-05, "loss": 0.7613, "step": 105 }, { "epoch": 0.05794724613912806, "grad_norm": 0.2958766222000122, "learning_rate": 4.079438566644454e-05, "loss": 0.8557, "step": 106 }, { "epoch": 0.05849391827251606, "grad_norm": 0.2869422137737274, "learning_rate": 4.015555943191231e-05, "loss": 0.8202, "step": 107 }, { "epoch": 0.05904059040590406, "grad_norm": 0.2743949890136719, "learning_rate": 3.9514972578557114e-05, "loss": 0.824, "step": 108 }, { "epoch": 0.05958726253929206, "grad_norm": 0.2857745289802551, "learning_rate": 3.8872843866202525e-05, "loss": 0.8618, "step": 109 }, { "epoch": 0.06013393467268006, "grad_norm": 0.2635364532470703, "learning_rate": 3.8229392581215565e-05, "loss": 0.7633, "step": 110 }, { "epoch": 0.06068060680606806, "grad_norm": 0.27216577529907227, "learning_rate": 3.7584838461620587e-05, "loss": 0.8427, "step": 111 }, { "epoch": 0.061227278939456065, "grad_norm": 0.28717127442359924, "learning_rate": 3.693940162205895e-05, "loss": 0.7745, "step": 112 }, { "epoch": 0.061773951072844065, "grad_norm": 0.2799830138683319, "learning_rate": 3.629330247862007e-05, "loss": 0.7972, "step": 113 }, { "epoch": 0.062320623206232065, "grad_norm": 0.3147888779640198, "learning_rate": 3.564676167356954e-05, "loss": 0.8222, "step": 114 }, { "epoch": 0.06286729533962006, "grad_norm": 0.30850040912628174, "learning_rate": 3.5e-05, "loss": 0.8039, "step": 115 }, { "epoch": 0.06341396747300806, "grad_norm": 0.3144112229347229, "learning_rate": 3.435323832643046e-05, "loss": 0.7823, "step": 116 }, { "epoch": 0.06396063960639606, "grad_norm": 0.3839268088340759, "learning_rate": 3.370669752137993e-05, "loss": 0.9132, "step": 117 }, { "epoch": 0.06450731173978407, "grad_norm": 0.37383535504341125, "learning_rate": 3.306059837794105e-05, "loss": 0.8359, "step": 118 }, { "epoch": 0.06505398387317206, "grad_norm": 0.42678016424179077, "learning_rate": 3.241516153837941e-05, "loss": 0.9635, "step": 119 }, { "epoch": 0.06560065600656007, "grad_norm": 0.478986918926239, "learning_rate": 3.177060741878443e-05, "loss": 0.8426, "step": 120 }, { "epoch": 0.06614732813994807, "grad_norm": 0.33792608976364136, "learning_rate": 3.1127156133797475e-05, "loss": 0.7594, "step": 121 }, { "epoch": 0.06669400027333607, "grad_norm": 0.401961088180542, "learning_rate": 3.048502742144289e-05, "loss": 0.9038, "step": 122 }, { "epoch": 0.06724067240672407, "grad_norm": 0.3711259067058563, "learning_rate": 2.984444056808768e-05, "loss": 0.8834, "step": 123 }, { "epoch": 0.06778734454011207, "grad_norm": 0.4284830689430237, "learning_rate": 2.9205614333555444e-05, "loss": 0.8564, "step": 124 }, { "epoch": 0.06833401667350007, "grad_norm": 0.4255787134170532, "learning_rate": 2.856876687642003e-05, "loss": 0.7818, "step": 125 }, { "epoch": 0.06888068880688807, "grad_norm": 0.4699144959449768, "learning_rate": 2.7934115679504645e-05, "loss": 0.8755, "step": 126 }, { "epoch": 0.06942736094027607, "grad_norm": 0.4922056198120117, "learning_rate": 2.7301877475611606e-05, "loss": 0.908, "step": 127 }, { "epoch": 0.06997403307366407, "grad_norm": 0.47387874126434326, "learning_rate": 2.667226817350835e-05, "loss": 0.9525, "step": 128 }, { "epoch": 0.07052070520705207, "grad_norm": 0.47072669863700867, "learning_rate": 2.604550278419475e-05, "loss": 0.8472, "step": 129 }, { "epoch": 0.07106737734044007, "grad_norm": 0.595217764377594, "learning_rate": 2.54217953474771e-05, "loss": 0.8562, "step": 130 }, { "epoch": 0.07161404947382807, "grad_norm": 0.585902214050293, "learning_rate": 2.4801358858873636e-05, "loss": 0.8419, "step": 131 }, { "epoch": 0.07216072160721607, "grad_norm": 0.6359130144119263, "learning_rate": 2.4184405196876842e-05, "loss": 0.8576, "step": 132 }, { "epoch": 0.07270739374060407, "grad_norm": 0.532152533531189, "learning_rate": 2.3571145050597088e-05, "loss": 0.8793, "step": 133 }, { "epoch": 0.07325406587399207, "grad_norm": 0.6238685250282288, "learning_rate": 2.296178784781251e-05, "loss": 0.8179, "step": 134 }, { "epoch": 0.07380073800738007, "grad_norm": 0.7119914889335632, "learning_rate": 2.2356541683449646e-05, "loss": 0.9274, "step": 135 }, { "epoch": 0.07434741014076808, "grad_norm": 0.7729023694992065, "learning_rate": 2.175561324851914e-05, "loss": 1.088, "step": 136 }, { "epoch": 0.07489408227415607, "grad_norm": 0.7109841108322144, "learning_rate": 2.1159207759531013e-05, "loss": 0.9875, "step": 137 }, { "epoch": 0.07544075440754408, "grad_norm": 0.6977630257606506, "learning_rate": 2.0567528888413382e-05, "loss": 0.8897, "step": 138 }, { "epoch": 0.07598742654093207, "grad_norm": 0.7141391038894653, "learning_rate": 1.9980778692958684e-05, "loss": 0.9585, "step": 139 }, { "epoch": 0.07653409867432008, "grad_norm": 0.7913526296615601, "learning_rate": 1.9399157547821162e-05, "loss": 0.8552, "step": 140 }, { "epoch": 0.07708077080770807, "grad_norm": 0.9253504276275635, "learning_rate": 1.882286407608904e-05, "loss": 0.88, "step": 141 }, { "epoch": 0.07762744294109608, "grad_norm": 1.1985421180725098, "learning_rate": 1.825209508145497e-05, "loss": 0.9832, "step": 142 }, { "epoch": 0.07817411507448407, "grad_norm": 0.9133749008178711, "learning_rate": 1.7687045481007746e-05, "loss": 0.8496, "step": 143 }, { "epoch": 0.07872078720787208, "grad_norm": 1.0693060159683228, "learning_rate": 1.712790823866826e-05, "loss": 0.9679, "step": 144 }, { "epoch": 0.07926745934126007, "grad_norm": 1.4201616048812866, "learning_rate": 1.657487429929254e-05, "loss": 0.9618, "step": 145 }, { "epoch": 0.07981413147464808, "grad_norm": 1.446235179901123, "learning_rate": 1.602813252346427e-05, "loss": 0.9855, "step": 146 }, { "epoch": 0.08036080360803607, "grad_norm": 1.2012956142425537, "learning_rate": 1.5487869622999004e-05, "loss": 0.8761, "step": 147 }, { "epoch": 0.08090747574142408, "grad_norm": 1.4860199689865112, "learning_rate": 1.4954270097182317e-05, "loss": 0.8868, "step": 148 }, { "epoch": 0.08145414787481207, "grad_norm": 1.4566800594329834, "learning_rate": 1.4427516169763444e-05, "loss": 0.8158, "step": 149 }, { "epoch": 0.08200082000820008, "grad_norm": 2.238769769668579, "learning_rate": 1.3907787726726029e-05, "loss": 0.9074, "step": 150 }, { "epoch": 0.08200082000820008, "eval_loss": 0.8439501523971558, "eval_runtime": 94.8117, "eval_samples_per_second": 32.496, "eval_steps_per_second": 8.132, "step": 150 }, { "epoch": 0.08254749214158809, "grad_norm": 0.21387195587158203, "learning_rate": 1.339526225485725e-05, "loss": 0.5987, "step": 151 }, { "epoch": 0.08309416427497608, "grad_norm": 0.21969622373580933, "learning_rate": 1.2890114781136224e-05, "loss": 0.701, "step": 152 }, { "epoch": 0.08364083640836409, "grad_norm": 0.23904825747013092, "learning_rate": 1.239251781296245e-05, "loss": 0.7627, "step": 153 }, { "epoch": 0.08418750854175208, "grad_norm": 0.25487756729125977, "learning_rate": 1.1902641279244715e-05, "loss": 0.7506, "step": 154 }, { "epoch": 0.08473418067514009, "grad_norm": 0.2533106207847595, "learning_rate": 1.1420652472370497e-05, "loss": 0.7238, "step": 155 }, { "epoch": 0.08528085280852808, "grad_norm": 0.2788962423801422, "learning_rate": 1.0946715991075805e-05, "loss": 0.8004, "step": 156 }, { "epoch": 0.08582752494191609, "grad_norm": 0.2542433738708496, "learning_rate": 1.0480993684234815e-05, "loss": 0.7999, "step": 157 }, { "epoch": 0.08637419707530408, "grad_norm": 0.2701437175273895, "learning_rate": 1.0023644595588671e-05, "loss": 0.7623, "step": 158 }, { "epoch": 0.08692086920869209, "grad_norm": 0.2716737389564514, "learning_rate": 9.57482490943216e-06, "loss": 0.8127, "step": 159 }, { "epoch": 0.08746754134208008, "grad_norm": 0.28511807322502136, "learning_rate": 9.134687897276934e-06, "loss": 0.8619, "step": 160 }, { "epoch": 0.08801421347546809, "grad_norm": 0.32100751996040344, "learning_rate": 8.703383865509432e-06, "loss": 0.8368, "step": 161 }, { "epoch": 0.08856088560885608, "grad_norm": 0.27488431334495544, "learning_rate": 8.281060104061394e-06, "loss": 0.7795, "step": 162 }, { "epoch": 0.08910755774224409, "grad_norm": 0.2900071442127228, "learning_rate": 7.867860836110453e-06, "loss": 0.8229, "step": 163 }, { "epoch": 0.0896542298756321, "grad_norm": 0.3044150471687317, "learning_rate": 7.463927168828087e-06, "loss": 0.8705, "step": 164 }, { "epoch": 0.09020090200902009, "grad_norm": 0.34009850025177, "learning_rate": 7.069397045191617e-06, "loss": 0.9019, "step": 165 }, { "epoch": 0.0907475741424081, "grad_norm": 0.33406728506088257, "learning_rate": 6.684405196876842e-06, "loss": 0.7769, "step": 166 }, { "epoch": 0.09129424627579609, "grad_norm": 0.35519149899482727, "learning_rate": 6.309083098247264e-06, "loss": 0.8062, "step": 167 }, { "epoch": 0.0918409184091841, "grad_norm": 0.33212265372276306, "learning_rate": 5.943558921455733e-06, "loss": 0.7822, "step": 168 }, { "epoch": 0.09238759054257209, "grad_norm": 0.3488451838493347, "learning_rate": 5.587957492673759e-06, "loss": 0.7944, "step": 169 }, { "epoch": 0.0929342626759601, "grad_norm": 0.4000808298587799, "learning_rate": 5.2424002494635095e-06, "loss": 0.823, "step": 170 }, { "epoch": 0.09348093480934809, "grad_norm": 0.37309879064559937, "learning_rate": 4.9070051993069636e-06, "loss": 0.8506, "step": 171 }, { "epoch": 0.0940276069427361, "grad_norm": 0.3660164773464203, "learning_rate": 4.581886879306507e-06, "loss": 0.8042, "step": 172 }, { "epoch": 0.09457427907612409, "grad_norm": 0.389154851436615, "learning_rate": 4.2671563170705725e-06, "loss": 0.9166, "step": 173 }, { "epoch": 0.0951209512095121, "grad_norm": 0.3975100815296173, "learning_rate": 3.962920992797834e-06, "loss": 0.7654, "step": 174 }, { "epoch": 0.09566762334290009, "grad_norm": 0.4443868100643158, "learning_rate": 3.6692848025728216e-06, "loss": 0.8351, "step": 175 }, { "epoch": 0.0962142954762881, "grad_norm": 0.4617209732532501, "learning_rate": 3.38634802288549e-06, "loss": 0.8746, "step": 176 }, { "epoch": 0.09676096760967609, "grad_norm": 0.4271644651889801, "learning_rate": 3.1142072763869042e-06, "loss": 0.7862, "step": 177 }, { "epoch": 0.0973076397430641, "grad_norm": 0.5230273604393005, "learning_rate": 2.852955498892694e-06, "loss": 0.7907, "step": 178 }, { "epoch": 0.0978543118764521, "grad_norm": 0.49080246686935425, "learning_rate": 2.6026819076455325e-06, "loss": 0.8707, "step": 179 }, { "epoch": 0.0984009840098401, "grad_norm": 0.524896502494812, "learning_rate": 2.36347197084755e-06, "loss": 0.8869, "step": 180 }, { "epoch": 0.0989476561432281, "grad_norm": 0.5297053456306458, "learning_rate": 2.1354073784730253e-06, "loss": 0.855, "step": 181 }, { "epoch": 0.0994943282766161, "grad_norm": 0.6338819265365601, "learning_rate": 1.9185660143713184e-06, "loss": 1.0045, "step": 182 }, { "epoch": 0.1000410004100041, "grad_norm": 0.6989834308624268, "learning_rate": 1.7130219296696263e-06, "loss": 0.9497, "step": 183 }, { "epoch": 0.1005876725433921, "grad_norm": 0.602967381477356, "learning_rate": 1.5188453174845743e-06, "loss": 0.8483, "step": 184 }, { "epoch": 0.1011343446767801, "grad_norm": 0.7088273763656616, "learning_rate": 1.3361024889513333e-06, "loss": 0.846, "step": 185 }, { "epoch": 0.1016810168101681, "grad_norm": 0.7697593569755554, "learning_rate": 1.16485585057844e-06, "loss": 0.9718, "step": 186 }, { "epoch": 0.1022276889435561, "grad_norm": 0.6648566722869873, "learning_rate": 1.0051638829360127e-06, "loss": 0.9544, "step": 187 }, { "epoch": 0.1027743610769441, "grad_norm": 0.8641712069511414, "learning_rate": 8.570811206847189e-07, "loss": 0.9615, "step": 188 }, { "epoch": 0.1033210332103321, "grad_norm": 0.8209101557731628, "learning_rate": 7.206581339521939e-07, "loss": 0.9253, "step": 189 }, { "epoch": 0.1038677053437201, "grad_norm": 0.7885981202125549, "learning_rate": 5.959415110634375e-07, "loss": 0.9993, "step": 190 }, { "epoch": 0.1044143774771081, "grad_norm": 0.8673349022865295, "learning_rate": 4.829738426309099e-07, "loss": 0.8788, "step": 191 }, { "epoch": 0.1049610496104961, "grad_norm": 1.1715073585510254, "learning_rate": 3.817937070098914e-07, "loss": 0.9042, "step": 192 }, { "epoch": 0.1055077217438841, "grad_norm": 0.8940293192863464, "learning_rate": 2.9243565712400384e-07, "loss": 0.9155, "step": 193 }, { "epoch": 0.10605439387727211, "grad_norm": 1.1870354413986206, "learning_rate": 2.1493020866542365e-07, "loss": 0.7754, "step": 194 }, { "epoch": 0.1066010660106601, "grad_norm": 1.0665806531906128, "learning_rate": 1.4930382967379363e-07, "loss": 0.7945, "step": 195 }, { "epoch": 0.10714773814404811, "grad_norm": 1.1132820844650269, "learning_rate": 9.557893149741924e-08, "loss": 0.8022, "step": 196 }, { "epoch": 0.1076944102774361, "grad_norm": 1.1399116516113281, "learning_rate": 5.377386113981197e-08, "loss": 0.7458, "step": 197 }, { "epoch": 0.10824108241082411, "grad_norm": 1.4629731178283691, "learning_rate": 2.3902894994198286e-08, "loss": 0.8933, "step": 198 }, { "epoch": 0.1087877545442121, "grad_norm": 2.013901710510254, "learning_rate": 5.976233968155164e-09, "loss": 1.116, "step": 199 }, { "epoch": 0.10933442667760011, "grad_norm": 2.5633609294891357, "learning_rate": 0.0, "loss": 0.9893, "step": 200 }, { "epoch": 0.10933442667760011, "eval_loss": 0.8386966586112976, "eval_runtime": 94.776, "eval_samples_per_second": 32.508, "eval_steps_per_second": 8.135, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 4, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.813079334767821e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }