|
{ |
|
"best_metric": NaN, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-50", |
|
"epoch": 0.09918175055789735, |
|
"eval_steps": 50, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004959087527894868, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.333333333333333e-06, |
|
"loss": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0004959087527894868, |
|
"eval_loss": NaN, |
|
"eval_runtime": 338.9156, |
|
"eval_samples_per_second": 10.023, |
|
"eval_steps_per_second": 2.508, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0009918175055789735, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.666666666666666e-06, |
|
"loss": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0014877262583684603, |
|
"grad_norm": NaN, |
|
"learning_rate": 7e-06, |
|
"loss": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.001983635011157947, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.333333333333333e-06, |
|
"loss": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.002479543763947434, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1666666666666665e-05, |
|
"loss": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0029754525167369206, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0034713612695264073, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.633333333333333e-05, |
|
"loss": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.003967270022315894, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.8666666666666665e-05, |
|
"loss": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.004463178775105381, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.004959087527894868, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.333333333333333e-05, |
|
"loss": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005454996280684354, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.5666666666666663e-05, |
|
"loss": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.005950905033473841, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.8e-05, |
|
"loss": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.006446813786263328, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.0333333333333333e-05, |
|
"loss": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.006942722539052815, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.266666666666666e-05, |
|
"loss": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.007438631291842301, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.007934540044631788, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.733333333333333e-05, |
|
"loss": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.008430448797421275, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.9666666666666664e-05, |
|
"loss": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.008926357550210762, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.009422266303000248, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.4333333333333324e-05, |
|
"loss": 4.6481, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.009918175055789735, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.666666666666666e-05, |
|
"loss": 21.5072, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.010414083808579222, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.899999999999999e-05, |
|
"loss": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.010909992561368709, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.1333333333333325e-05, |
|
"loss": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.011405901314158196, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.3666666666666666e-05, |
|
"loss": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.011901810066947682, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6e-05, |
|
"loss": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.012397718819737169, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.833333333333333e-05, |
|
"loss": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.012893627572526656, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.0666666666666666e-05, |
|
"loss": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.013389536325316143, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.01388544507810563, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.533333333333333e-05, |
|
"loss": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.014381353830895116, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.766666666666667e-05, |
|
"loss": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.014877262583684603, |
|
"grad_norm": NaN, |
|
"learning_rate": 7e-05, |
|
"loss": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01537317133647409, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.999402376603183e-05, |
|
"loss": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.015869080089263576, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.99760971050058e-05, |
|
"loss": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.016364988842053063, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.994622613886018e-05, |
|
"loss": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.01686089759484255, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.990442106850258e-05, |
|
"loss": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.017356806347632037, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.98506961703262e-05, |
|
"loss": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.017852715100421523, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.978506979133457e-05, |
|
"loss": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.01834862385321101, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.9707564342876e-05, |
|
"loss": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.018844532606000497, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.96182062929901e-05, |
|
"loss": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.019340441358789984, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.951702615736908e-05, |
|
"loss": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.01983635011157947, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.940405848893656e-05, |
|
"loss": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.020332258864368957, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.92793418660478e-05, |
|
"loss": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.020828167617158444, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.914291887931528e-05, |
|
"loss": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.02132407636994793, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.899483611706398e-05, |
|
"loss": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.021819985122737418, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.883514414942155e-05, |
|
"loss": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.022315893875526904, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.866389751104867e-05, |
|
"loss": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.02281180262831639, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.848115468251542e-05, |
|
"loss": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.023307711381105878, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.828697807033038e-05, |
|
"loss": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.023803620133895365, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.808143398562868e-05, |
|
"loss": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.02429952888668485, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.786459262152698e-05, |
|
"loss": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.024795437639474338, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.763652802915244e-05, |
|
"loss": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.024795437639474338, |
|
"eval_loss": NaN, |
|
"eval_runtime": 337.9793, |
|
"eval_samples_per_second": 10.051, |
|
"eval_steps_per_second": 2.515, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025291346392263825, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.739731809235446e-05, |
|
"loss": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.02578725514505331, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.71470445011073e-05, |
|
"loss": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0262831638978428, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.688579272361309e-05, |
|
"loss": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.026779072650632285, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.66136519771145e-05, |
|
"loss": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.027274981403421772, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.633071519742718e-05, |
|
"loss": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.02777089015621126, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.603707900720217e-05, |
|
"loss": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.028266798909000745, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.573284368292943e-05, |
|
"loss": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.028762707661790232, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.541811312069348e-05, |
|
"loss": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.02925861641457972, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.509299480069303e-05, |
|
"loss": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.029754525167369206, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.47575997505365e-05, |
|
"loss": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.030250433920158692, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.441204250732624e-05, |
|
"loss": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.03074634267294818, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.405644107854427e-05, |
|
"loss": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.031242251425737666, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.369091690175273e-05, |
|
"loss": 0.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.03173816017852715, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.331559480312315e-05, |
|
"loss": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.03223406893131664, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.293060295480838e-05, |
|
"loss": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.032729977684106126, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.25360728311719e-05, |
|
"loss": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.03322588643689561, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.213213916388954e-05, |
|
"loss": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.0337217951896851, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.171893989593859e-05, |
|
"loss": 0.8073, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.03421770394247459, |
|
"grad_norm": 26.310998916625977, |
|
"learning_rate": 6.129661613449057e-05, |
|
"loss": 6.619, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.03471361269526407, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.086531210272306e-05, |
|
"loss": 81.0255, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03520952144805356, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.042517509056784e-05, |
|
"loss": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.03570543020084305, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.997635540441133e-05, |
|
"loss": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.036201338953632534, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9519006315765176e-05, |
|
"loss": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.03669724770642202, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.9053284008924185e-05, |
|
"loss": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.03719315645921151, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.85793475276295e-05, |
|
"loss": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.037689065212000994, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.809735872075529e-05, |
|
"loss": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.03818497396479048, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.760748218703755e-05, |
|
"loss": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.03868088271757997, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.710988521886378e-05, |
|
"loss": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.039176791470369454, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.660473774514275e-05, |
|
"loss": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.03967270022315894, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.6092212273273975e-05, |
|
"loss": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04016860897594843, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.557248383023655e-05, |
|
"loss": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.040664517728737914, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.5045729902817676e-05, |
|
"loss": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.0411604264815274, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.4512130377000987e-05, |
|
"loss": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.04165633523431689, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.397186747653573e-05, |
|
"loss": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.042152243987106375, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.342512570070745e-05, |
|
"loss": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.04264815273989586, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.287209176133174e-05, |
|
"loss": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.04314406149268535, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.231295451899226e-05, |
|
"loss": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.043639970245474835, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.174790491854502e-05, |
|
"loss": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.04413587899826432, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.117713592391096e-05, |
|
"loss": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.04463178775105381, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.060084245217884e-05, |
|
"loss": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.045127696503843295, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.0019221307041306e-05, |
|
"loss": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.04562360525663278, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.943247111158662e-05, |
|
"loss": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.04611951400942227, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.884079224046898e-05, |
|
"loss": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.046615422762211756, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.824438675148086e-05, |
|
"loss": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.04711133151500124, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.764345831655036e-05, |
|
"loss": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.04760724026779073, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.703821215218748e-05, |
|
"loss": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.048103149020580216, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.642885494940291e-05, |
|
"loss": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.0485990577733697, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.581559480312316e-05, |
|
"loss": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.04909496652615919, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.519864114112636e-05, |
|
"loss": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.049590875278948676, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.45782046525229e-05, |
|
"loss": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.049590875278948676, |
|
"eval_loss": NaN, |
|
"eval_runtime": 338.0774, |
|
"eval_samples_per_second": 10.048, |
|
"eval_steps_per_second": 2.514, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05008678403173816, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.3954497215805244e-05, |
|
"loss": 0.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.05058269278452765, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.332773182649165e-05, |
|
"loss": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.051078601537317136, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2698122524388405e-05, |
|
"loss": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.05157451029010662, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.206588432049535e-05, |
|
"loss": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.05207041904289611, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.143123312357996e-05, |
|
"loss": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0525663277956856, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.079438566644454e-05, |
|
"loss": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.053062236548475084, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.015555943191231e-05, |
|
"loss": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.05355814530126457, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.9514972578557114e-05, |
|
"loss": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.8872843866202525e-05, |
|
"loss": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.054549962806843544, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.8229392581215565e-05, |
|
"loss": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05504587155963303, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.7584838461620587e-05, |
|
"loss": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.05554178031242252, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.693940162205895e-05, |
|
"loss": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.056037689065212004, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.629330247862007e-05, |
|
"loss": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.05653359781800149, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.564676167356954e-05, |
|
"loss": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.05702950657079098, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.057525415323580464, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.435323832643046e-05, |
|
"loss": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.05802132407636995, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.370669752137993e-05, |
|
"loss": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.05851723282915944, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.306059837794105e-05, |
|
"loss": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.059013141581948925, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.241516153837941e-05, |
|
"loss": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.05950905033473841, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.177060741878443e-05, |
|
"loss": 2.3783, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0600049590875279, |
|
"grad_norm": 10.533272743225098, |
|
"learning_rate": 3.1127156133797475e-05, |
|
"loss": 5.9216, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.060500867840317385, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.048502742144289e-05, |
|
"loss": 80.022, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.06099677659310687, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.984444056808768e-05, |
|
"loss": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.06149268534589636, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9205614333555444e-05, |
|
"loss": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.061988594098685845, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.856876687642003e-05, |
|
"loss": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.06248450285147533, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7934115679504645e-05, |
|
"loss": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.06298041160426482, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.7301877475611606e-05, |
|
"loss": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.0634763203570543, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.667226817350835e-05, |
|
"loss": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.06397222910984379, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.604550278419475e-05, |
|
"loss": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.06446813786263328, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.54217953474771e-05, |
|
"loss": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06496404661542277, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4801358858873636e-05, |
|
"loss": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.06545995536821225, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.4184405196876842e-05, |
|
"loss": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.06595586412100174, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3571145050597088e-05, |
|
"loss": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.06645177287379123, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.296178784781251e-05, |
|
"loss": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.06694768162658071, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.2356541683449646e-05, |
|
"loss": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0674435903793702, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.175561324851914e-05, |
|
"loss": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.06793949913215969, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1159207759531013e-05, |
|
"loss": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.06843540788494917, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.0567528888413382e-05, |
|
"loss": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.06893131663773866, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9980778692958684e-05, |
|
"loss": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.06942722539052815, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9399157547821162e-05, |
|
"loss": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06992313414331763, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.882286407608904e-05, |
|
"loss": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.07041904289610712, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.825209508145497e-05, |
|
"loss": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.07091495164889661, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7687045481007746e-05, |
|
"loss": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.0714108604016861, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.712790823866826e-05, |
|
"loss": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.07190676915447558, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.657487429929254e-05, |
|
"loss": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.07240267790726507, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.602813252346427e-05, |
|
"loss": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.07289858666005455, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5487869622999004e-05, |
|
"loss": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.07339449541284404, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4954270097182317e-05, |
|
"loss": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.07389040416563353, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4427516169763444e-05, |
|
"loss": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.07438631291842301, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3907787726726029e-05, |
|
"loss": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07438631291842301, |
|
"eval_loss": NaN, |
|
"eval_runtime": 339.1066, |
|
"eval_samples_per_second": 10.017, |
|
"eval_steps_per_second": 2.507, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0748822216712125, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.339526225485725e-05, |
|
"loss": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.07537813042400199, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.2890114781136224e-05, |
|
"loss": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.07587403917679147, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.239251781296245e-05, |
|
"loss": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.07636994792958096, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1902641279244715e-05, |
|
"loss": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.07686585668237045, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.1420652472370497e-05, |
|
"loss": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.07736176543515993, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0946715991075805e-05, |
|
"loss": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.07785767418794942, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0480993684234815e-05, |
|
"loss": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.07835358294073891, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0023644595588671e-05, |
|
"loss": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.0788494916935284, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.57482490943216e-06, |
|
"loss": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.07934540044631788, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.134687897276934e-06, |
|
"loss": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07984130919910737, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.703383865509432e-06, |
|
"loss": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.08033721795189686, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.281060104061394e-06, |
|
"loss": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.08083312670468634, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.867860836110453e-06, |
|
"loss": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.08132903545747583, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.463927168828087e-06, |
|
"loss": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.08182494421026532, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.069397045191617e-06, |
|
"loss": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0823208529630548, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.684405196876842e-06, |
|
"loss": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.08281676171584429, |
|
"grad_norm": NaN, |
|
"learning_rate": 6.309083098247264e-06, |
|
"loss": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.08331267046863378, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.943558921455733e-06, |
|
"loss": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.08380857922142326, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.587957492673759e-06, |
|
"loss": 3.2668, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.08430448797421275, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.2424002494635095e-06, |
|
"loss": 50.263, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08480039672700224, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.9070051993069636e-06, |
|
"loss": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.08529630547979172, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.581886879306507e-06, |
|
"loss": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.08579221423258121, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.2671563170705725e-06, |
|
"loss": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.0862881229853707, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.962920992797834e-06, |
|
"loss": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.08678403173816018, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.6692848025728216e-06, |
|
"loss": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.08727994049094967, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.38634802288549e-06, |
|
"loss": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.08777584924373916, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.1142072763869042e-06, |
|
"loss": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.08827175799652864, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.852955498892694e-06, |
|
"loss": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.08876766674931813, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.6026819076455325e-06, |
|
"loss": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.08926357550210762, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.36347197084755e-06, |
|
"loss": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0897594842548971, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1354073784730253e-06, |
|
"loss": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.09025539300768659, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.9185660143713184e-06, |
|
"loss": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.09075130176047608, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.7130219296696263e-06, |
|
"loss": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.09124721051326556, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.5188453174845743e-06, |
|
"loss": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.09174311926605505, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.3361024889513333e-06, |
|
"loss": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.09223902801884454, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.16485585057844e-06, |
|
"loss": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.09273493677163402, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.0051638829360127e-06, |
|
"loss": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.09323084552442351, |
|
"grad_norm": NaN, |
|
"learning_rate": 8.570811206847189e-07, |
|
"loss": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.093726754277213, |
|
"grad_norm": NaN, |
|
"learning_rate": 7.206581339521939e-07, |
|
"loss": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.09422266303000248, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.959415110634375e-07, |
|
"loss": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09471857178279197, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.829738426309099e-07, |
|
"loss": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.09521448053558146, |
|
"grad_norm": NaN, |
|
"learning_rate": 3.817937070098914e-07, |
|
"loss": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.09571038928837095, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.9243565712400384e-07, |
|
"loss": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.09620629804116043, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.1493020866542365e-07, |
|
"loss": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.09670220679394992, |
|
"grad_norm": NaN, |
|
"learning_rate": 1.4930382967379363e-07, |
|
"loss": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.0971981155467394, |
|
"grad_norm": NaN, |
|
"learning_rate": 9.557893149741924e-08, |
|
"loss": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.09769402429952889, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.377386113981197e-08, |
|
"loss": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.09818993305231838, |
|
"grad_norm": NaN, |
|
"learning_rate": 2.3902894994198286e-08, |
|
"loss": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.09868584180510787, |
|
"grad_norm": NaN, |
|
"learning_rate": 5.976233968155164e-09, |
|
"loss": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.09918175055789735, |
|
"grad_norm": NaN, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09918175055789735, |
|
"eval_loss": NaN, |
|
"eval_runtime": 338.2367, |
|
"eval_samples_per_second": 10.043, |
|
"eval_steps_per_second": 2.513, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 4, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.875744616724562e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|