{
"best_metric": 2.4503767490386963,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 0.11286681715575621,
"eval_steps": 50,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007524454477050414,
"grad_norm": 4.9338603019714355,
"learning_rate": 1.018e-05,
"loss": 4.511,
"step": 1
},
{
"epoch": 0.0007524454477050414,
"eval_loss": 2.8468708992004395,
"eval_runtime": 37.6411,
"eval_samples_per_second": 14.877,
"eval_steps_per_second": 3.719,
"step": 1
},
{
"epoch": 0.0015048908954100827,
"grad_norm": 5.828423500061035,
"learning_rate": 2.036e-05,
"loss": 3.8614,
"step": 2
},
{
"epoch": 0.002257336343115124,
"grad_norm": 7.995044708251953,
"learning_rate": 3.0539999999999996e-05,
"loss": 4.6689,
"step": 3
},
{
"epoch": 0.0030097817908201654,
"grad_norm": 6.929337024688721,
"learning_rate": 4.072e-05,
"loss": 4.2038,
"step": 4
},
{
"epoch": 0.003762227238525207,
"grad_norm": 6.514157772064209,
"learning_rate": 5.09e-05,
"loss": 4.6263,
"step": 5
},
{
"epoch": 0.004514672686230248,
"grad_norm": 6.524606227874756,
"learning_rate": 6.107999999999999e-05,
"loss": 4.8234,
"step": 6
},
{
"epoch": 0.005267118133935289,
"grad_norm": 9.753814697265625,
"learning_rate": 7.125999999999999e-05,
"loss": 4.544,
"step": 7
},
{
"epoch": 0.006019563581640331,
"grad_norm": 6.565380096435547,
"learning_rate": 8.144e-05,
"loss": 4.2002,
"step": 8
},
{
"epoch": 0.006772009029345372,
"grad_norm": 7.157611846923828,
"learning_rate": 9.162e-05,
"loss": 3.7329,
"step": 9
},
{
"epoch": 0.007524454477050414,
"grad_norm": 8.59189510345459,
"learning_rate": 0.0001018,
"loss": 5.0335,
"step": 10
},
{
"epoch": 0.008276899924755455,
"grad_norm": 10.278609275817871,
"learning_rate": 0.00010126421052631578,
"loss": 5.018,
"step": 11
},
{
"epoch": 0.009029345372460496,
"grad_norm": 11.457633018493652,
"learning_rate": 0.00010072842105263156,
"loss": 5.185,
"step": 12
},
{
"epoch": 0.009781790820165538,
"grad_norm": 8.473273277282715,
"learning_rate": 0.00010019263157894736,
"loss": 4.661,
"step": 13
},
{
"epoch": 0.010534236267870579,
"grad_norm": 9.23606014251709,
"learning_rate": 9.965684210526316e-05,
"loss": 5.1647,
"step": 14
},
{
"epoch": 0.011286681715575621,
"grad_norm": 10.785460472106934,
"learning_rate": 9.912105263157895e-05,
"loss": 5.2782,
"step": 15
},
{
"epoch": 0.012039127163280662,
"grad_norm": 9.376898765563965,
"learning_rate": 9.858526315789473e-05,
"loss": 4.4393,
"step": 16
},
{
"epoch": 0.012791572610985704,
"grad_norm": 10.466320037841797,
"learning_rate": 9.804947368421052e-05,
"loss": 5.3007,
"step": 17
},
{
"epoch": 0.013544018058690745,
"grad_norm": 8.497576713562012,
"learning_rate": 9.75136842105263e-05,
"loss": 4.941,
"step": 18
},
{
"epoch": 0.014296463506395787,
"grad_norm": 9.376507759094238,
"learning_rate": 9.69778947368421e-05,
"loss": 5.1504,
"step": 19
},
{
"epoch": 0.015048908954100828,
"grad_norm": 9.785571098327637,
"learning_rate": 9.644210526315789e-05,
"loss": 4.6566,
"step": 20
},
{
"epoch": 0.01580135440180587,
"grad_norm": 16.524185180664062,
"learning_rate": 9.590631578947369e-05,
"loss": 5.2649,
"step": 21
},
{
"epoch": 0.01655379984951091,
"grad_norm": 9.190619468688965,
"learning_rate": 9.537052631578947e-05,
"loss": 4.9649,
"step": 22
},
{
"epoch": 0.01730624529721595,
"grad_norm": 15.659117698669434,
"learning_rate": 9.483473684210526e-05,
"loss": 6.6098,
"step": 23
},
{
"epoch": 0.01805869074492099,
"grad_norm": 9.867484092712402,
"learning_rate": 9.429894736842104e-05,
"loss": 5.5985,
"step": 24
},
{
"epoch": 0.018811136192626036,
"grad_norm": 8.590432167053223,
"learning_rate": 9.376315789473684e-05,
"loss": 4.1838,
"step": 25
},
{
"epoch": 0.019563581640331076,
"grad_norm": 8.883073806762695,
"learning_rate": 9.322736842105262e-05,
"loss": 4.8669,
"step": 26
},
{
"epoch": 0.020316027088036117,
"grad_norm": 8.349895477294922,
"learning_rate": 9.269157894736842e-05,
"loss": 5.1803,
"step": 27
},
{
"epoch": 0.021068472535741158,
"grad_norm": 11.773541450500488,
"learning_rate": 9.215578947368421e-05,
"loss": 5.0873,
"step": 28
},
{
"epoch": 0.0218209179834462,
"grad_norm": 13.537124633789062,
"learning_rate": 9.162e-05,
"loss": 4.6728,
"step": 29
},
{
"epoch": 0.022573363431151242,
"grad_norm": 12.410065650939941,
"learning_rate": 9.108421052631578e-05,
"loss": 5.1454,
"step": 30
},
{
"epoch": 0.023325808878856283,
"grad_norm": 11.18578052520752,
"learning_rate": 9.054842105263158e-05,
"loss": 4.2279,
"step": 31
},
{
"epoch": 0.024078254326561323,
"grad_norm": 38.089595794677734,
"learning_rate": 9.001263157894736e-05,
"loss": 5.0972,
"step": 32
},
{
"epoch": 0.024830699774266364,
"grad_norm": 20.254100799560547,
"learning_rate": 8.947684210526315e-05,
"loss": 4.9312,
"step": 33
},
{
"epoch": 0.025583145221971408,
"grad_norm": 32.83479309082031,
"learning_rate": 8.894105263157895e-05,
"loss": 5.4251,
"step": 34
},
{
"epoch": 0.02633559066967645,
"grad_norm": 14.390934944152832,
"learning_rate": 8.840526315789473e-05,
"loss": 5.1836,
"step": 35
},
{
"epoch": 0.02708803611738149,
"grad_norm": 17.68592071533203,
"learning_rate": 8.786947368421052e-05,
"loss": 5.4996,
"step": 36
},
{
"epoch": 0.02784048156508653,
"grad_norm": 15.407584190368652,
"learning_rate": 8.733368421052632e-05,
"loss": 5.5683,
"step": 37
},
{
"epoch": 0.028592927012791574,
"grad_norm": 13.758222579956055,
"learning_rate": 8.67978947368421e-05,
"loss": 5.6531,
"step": 38
},
{
"epoch": 0.029345372460496615,
"grad_norm": 15.094158172607422,
"learning_rate": 8.626210526315789e-05,
"loss": 4.6223,
"step": 39
},
{
"epoch": 0.030097817908201655,
"grad_norm": 14.733675003051758,
"learning_rate": 8.572631578947367e-05,
"loss": 5.6096,
"step": 40
},
{
"epoch": 0.030850263355906696,
"grad_norm": 11.245537757873535,
"learning_rate": 8.519052631578947e-05,
"loss": 4.9552,
"step": 41
},
{
"epoch": 0.03160270880361174,
"grad_norm": 15.303187370300293,
"learning_rate": 8.465473684210527e-05,
"loss": 6.245,
"step": 42
},
{
"epoch": 0.03235515425131678,
"grad_norm": 12.705514907836914,
"learning_rate": 8.411894736842105e-05,
"loss": 5.5335,
"step": 43
},
{
"epoch": 0.03310759969902182,
"grad_norm": 12.836231231689453,
"learning_rate": 8.358315789473684e-05,
"loss": 5.8837,
"step": 44
},
{
"epoch": 0.033860045146726865,
"grad_norm": 12.260278701782227,
"learning_rate": 8.304736842105262e-05,
"loss": 4.22,
"step": 45
},
{
"epoch": 0.0346124905944319,
"grad_norm": 15.98351764678955,
"learning_rate": 8.251157894736841e-05,
"loss": 4.8508,
"step": 46
},
{
"epoch": 0.035364936042136946,
"grad_norm": 18.42877197265625,
"learning_rate": 8.197578947368421e-05,
"loss": 5.7646,
"step": 47
},
{
"epoch": 0.03611738148984198,
"grad_norm": 15.042816162109375,
"learning_rate": 8.144e-05,
"loss": 6.2102,
"step": 48
},
{
"epoch": 0.03686982693754703,
"grad_norm": 20.197011947631836,
"learning_rate": 8.090421052631579e-05,
"loss": 7.2784,
"step": 49
},
{
"epoch": 0.03762227238525207,
"grad_norm": 22.858545303344727,
"learning_rate": 8.036842105263158e-05,
"loss": 6.3548,
"step": 50
},
{
"epoch": 0.03762227238525207,
"eval_loss": 2.573653221130371,
"eval_runtime": 37.6194,
"eval_samples_per_second": 14.886,
"eval_steps_per_second": 3.721,
"step": 50
},
{
"epoch": 0.03837471783295711,
"grad_norm": 6.201414108276367,
"learning_rate": 7.983263157894736e-05,
"loss": 4.6951,
"step": 51
},
{
"epoch": 0.03912716328066215,
"grad_norm": 4.830835342407227,
"learning_rate": 7.929684210526315e-05,
"loss": 5.02,
"step": 52
},
{
"epoch": 0.0398796087283672,
"grad_norm": 7.732100009918213,
"learning_rate": 7.876105263157895e-05,
"loss": 4.9632,
"step": 53
},
{
"epoch": 0.040632054176072234,
"grad_norm": 7.422882556915283,
"learning_rate": 7.822526315789473e-05,
"loss": 4.2894,
"step": 54
},
{
"epoch": 0.04138449962377728,
"grad_norm": 6.019810676574707,
"learning_rate": 7.768947368421053e-05,
"loss": 4.5985,
"step": 55
},
{
"epoch": 0.042136945071482315,
"grad_norm": 7.448675632476807,
"learning_rate": 7.715368421052631e-05,
"loss": 5.6458,
"step": 56
},
{
"epoch": 0.04288939051918736,
"grad_norm": 6.525016784667969,
"learning_rate": 7.66178947368421e-05,
"loss": 4.4928,
"step": 57
},
{
"epoch": 0.0436418359668924,
"grad_norm": 5.862019062042236,
"learning_rate": 7.608210526315788e-05,
"loss": 5.0442,
"step": 58
},
{
"epoch": 0.04439428141459744,
"grad_norm": 6.698094844818115,
"learning_rate": 7.554631578947368e-05,
"loss": 5.1862,
"step": 59
},
{
"epoch": 0.045146726862302484,
"grad_norm": 5.901148796081543,
"learning_rate": 7.501052631578947e-05,
"loss": 4.6401,
"step": 60
},
{
"epoch": 0.04589917231000752,
"grad_norm": 8.506747245788574,
"learning_rate": 7.447473684210527e-05,
"loss": 5.6167,
"step": 61
},
{
"epoch": 0.046651617757712566,
"grad_norm": 8.143284797668457,
"learning_rate": 7.393894736842105e-05,
"loss": 4.5655,
"step": 62
},
{
"epoch": 0.04740406320541761,
"grad_norm": 5.302389621734619,
"learning_rate": 7.340315789473684e-05,
"loss": 4.4897,
"step": 63
},
{
"epoch": 0.04815650865312265,
"grad_norm": 5.529751300811768,
"learning_rate": 7.286736842105262e-05,
"loss": 4.2722,
"step": 64
},
{
"epoch": 0.04890895410082769,
"grad_norm": 8.762489318847656,
"learning_rate": 7.233157894736842e-05,
"loss": 5.1436,
"step": 65
},
{
"epoch": 0.04966139954853273,
"grad_norm": 11.295607566833496,
"learning_rate": 7.179578947368421e-05,
"loss": 4.9496,
"step": 66
},
{
"epoch": 0.05041384499623777,
"grad_norm": 9.971809387207031,
"learning_rate": 7.125999999999999e-05,
"loss": 5.4893,
"step": 67
},
{
"epoch": 0.051166290443942816,
"grad_norm": 7.344180583953857,
"learning_rate": 7.072421052631579e-05,
"loss": 4.6742,
"step": 68
},
{
"epoch": 0.05191873589164785,
"grad_norm": 9.844782829284668,
"learning_rate": 7.018842105263158e-05,
"loss": 5.1691,
"step": 69
},
{
"epoch": 0.0526711813393529,
"grad_norm": 8.95771598815918,
"learning_rate": 6.965263157894736e-05,
"loss": 4.879,
"step": 70
},
{
"epoch": 0.05342362678705794,
"grad_norm": 8.04973030090332,
"learning_rate": 6.911684210526316e-05,
"loss": 5.0598,
"step": 71
},
{
"epoch": 0.05417607223476298,
"grad_norm": 8.579294204711914,
"learning_rate": 6.858105263157894e-05,
"loss": 4.8055,
"step": 72
},
{
"epoch": 0.05492851768246802,
"grad_norm": 7.308038234710693,
"learning_rate": 6.804526315789473e-05,
"loss": 4.7293,
"step": 73
},
{
"epoch": 0.05568096313017306,
"grad_norm": 7.7795233726501465,
"learning_rate": 6.750947368421052e-05,
"loss": 5.0379,
"step": 74
},
{
"epoch": 0.056433408577878104,
"grad_norm": 8.343374252319336,
"learning_rate": 6.697368421052631e-05,
"loss": 4.4904,
"step": 75
},
{
"epoch": 0.05718585402558315,
"grad_norm": 7.44524621963501,
"learning_rate": 6.64378947368421e-05,
"loss": 4.7685,
"step": 76
},
{
"epoch": 0.057938299473288185,
"grad_norm": 10.718270301818848,
"learning_rate": 6.59021052631579e-05,
"loss": 5.2094,
"step": 77
},
{
"epoch": 0.05869074492099323,
"grad_norm": 9.407214164733887,
"learning_rate": 6.536631578947368e-05,
"loss": 5.0866,
"step": 78
},
{
"epoch": 0.059443190368698266,
"grad_norm": 8.373135566711426,
"learning_rate": 6.483052631578947e-05,
"loss": 4.8998,
"step": 79
},
{
"epoch": 0.06019563581640331,
"grad_norm": 8.654524803161621,
"learning_rate": 6.429473684210525e-05,
"loss": 4.6364,
"step": 80
},
{
"epoch": 0.060948081264108354,
"grad_norm": 9.621380805969238,
"learning_rate": 6.375894736842104e-05,
"loss": 5.1948,
"step": 81
},
{
"epoch": 0.06170052671181339,
"grad_norm": 13.150786399841309,
"learning_rate": 6.322315789473684e-05,
"loss": 5.4827,
"step": 82
},
{
"epoch": 0.062452972159518436,
"grad_norm": 10.533498764038086,
"learning_rate": 6.268736842105264e-05,
"loss": 4.9195,
"step": 83
},
{
"epoch": 0.06320541760722348,
"grad_norm": 10.708104133605957,
"learning_rate": 6.215157894736842e-05,
"loss": 4.8833,
"step": 84
},
{
"epoch": 0.06395786305492852,
"grad_norm": 7.892517566680908,
"learning_rate": 6.16157894736842e-05,
"loss": 5.0272,
"step": 85
},
{
"epoch": 0.06471030850263355,
"grad_norm": 13.007377624511719,
"learning_rate": 6.107999999999999e-05,
"loss": 4.8137,
"step": 86
},
{
"epoch": 0.0654627539503386,
"grad_norm": 11.001848220825195,
"learning_rate": 6.054421052631578e-05,
"loss": 5.8105,
"step": 87
},
{
"epoch": 0.06621519939804364,
"grad_norm": 13.360245704650879,
"learning_rate": 6.000842105263157e-05,
"loss": 5.2308,
"step": 88
},
{
"epoch": 0.06696764484574869,
"grad_norm": 8.78776741027832,
"learning_rate": 5.947263157894737e-05,
"loss": 4.3293,
"step": 89
},
{
"epoch": 0.06772009029345373,
"grad_norm": 11.788161277770996,
"learning_rate": 5.893684210526316e-05,
"loss": 5.5094,
"step": 90
},
{
"epoch": 0.06847253574115876,
"grad_norm": 13.817206382751465,
"learning_rate": 5.8401052631578944e-05,
"loss": 5.6535,
"step": 91
},
{
"epoch": 0.0692249811888638,
"grad_norm": 10.35663890838623,
"learning_rate": 5.7865263157894736e-05,
"loss": 4.9656,
"step": 92
},
{
"epoch": 0.06997742663656885,
"grad_norm": 12.754554748535156,
"learning_rate": 5.732947368421052e-05,
"loss": 6.037,
"step": 93
},
{
"epoch": 0.07072987208427389,
"grad_norm": 13.788698196411133,
"learning_rate": 5.6793684210526306e-05,
"loss": 5.4732,
"step": 94
},
{
"epoch": 0.07148231753197894,
"grad_norm": 10.369476318359375,
"learning_rate": 5.6257894736842105e-05,
"loss": 4.9698,
"step": 95
},
{
"epoch": 0.07223476297968397,
"grad_norm": 11.039383888244629,
"learning_rate": 5.57221052631579e-05,
"loss": 5.0991,
"step": 96
},
{
"epoch": 0.07298720842738901,
"grad_norm": 18.217975616455078,
"learning_rate": 5.518631578947368e-05,
"loss": 5.7764,
"step": 97
},
{
"epoch": 0.07373965387509406,
"grad_norm": 13.361612319946289,
"learning_rate": 5.4650526315789474e-05,
"loss": 4.8924,
"step": 98
},
{
"epoch": 0.0744920993227991,
"grad_norm": 24.20296287536621,
"learning_rate": 5.411473684210526e-05,
"loss": 6.2003,
"step": 99
},
{
"epoch": 0.07524454477050414,
"grad_norm": 16.34416389465332,
"learning_rate": 5.3578947368421044e-05,
"loss": 5.3467,
"step": 100
},
{
"epoch": 0.07524454477050414,
"eval_loss": 2.492374897003174,
"eval_runtime": 37.6751,
"eval_samples_per_second": 14.864,
"eval_steps_per_second": 3.716,
"step": 100
},
{
"epoch": 0.07599699021820917,
"grad_norm": 4.632258415222168,
"learning_rate": 5.3043157894736836e-05,
"loss": 4.8204,
"step": 101
},
{
"epoch": 0.07674943566591422,
"grad_norm": 5.53971004486084,
"learning_rate": 5.2507368421052635e-05,
"loss": 5.2553,
"step": 102
},
{
"epoch": 0.07750188111361926,
"grad_norm": 3.8668930530548096,
"learning_rate": 5.197157894736842e-05,
"loss": 4.3132,
"step": 103
},
{
"epoch": 0.0782543265613243,
"grad_norm": 5.11984920501709,
"learning_rate": 5.143578947368421e-05,
"loss": 4.5635,
"step": 104
},
{
"epoch": 0.07900677200902935,
"grad_norm": 4.6450347900390625,
"learning_rate": 5.09e-05,
"loss": 3.4141,
"step": 105
},
{
"epoch": 0.0797592174567344,
"grad_norm": 5.824936389923096,
"learning_rate": 5.036421052631578e-05,
"loss": 4.7563,
"step": 106
},
{
"epoch": 0.08051166290443942,
"grad_norm": 4.639711380004883,
"learning_rate": 4.982842105263158e-05,
"loss": 4.3026,
"step": 107
},
{
"epoch": 0.08126410835214447,
"grad_norm": 8.702680587768555,
"learning_rate": 4.9292631578947366e-05,
"loss": 5.7537,
"step": 108
},
{
"epoch": 0.08201655379984951,
"grad_norm": 5.901224613189697,
"learning_rate": 4.875684210526315e-05,
"loss": 5.0042,
"step": 109
},
{
"epoch": 0.08276899924755456,
"grad_norm": 8.818628311157227,
"learning_rate": 4.822105263157894e-05,
"loss": 5.3414,
"step": 110
},
{
"epoch": 0.0835214446952596,
"grad_norm": 7.103747367858887,
"learning_rate": 4.7685263157894735e-05,
"loss": 5.2836,
"step": 111
},
{
"epoch": 0.08427389014296463,
"grad_norm": 6.621494770050049,
"learning_rate": 4.714947368421052e-05,
"loss": 4.9538,
"step": 112
},
{
"epoch": 0.08502633559066967,
"grad_norm": 8.953717231750488,
"learning_rate": 4.661368421052631e-05,
"loss": 5.1009,
"step": 113
},
{
"epoch": 0.08577878103837472,
"grad_norm": 8.522113800048828,
"learning_rate": 4.6077894736842104e-05,
"loss": 5.0785,
"step": 114
},
{
"epoch": 0.08653122648607976,
"grad_norm": 6.302427291870117,
"learning_rate": 4.554210526315789e-05,
"loss": 4.6044,
"step": 115
},
{
"epoch": 0.0872836719337848,
"grad_norm": 8.132070541381836,
"learning_rate": 4.500631578947368e-05,
"loss": 4.9639,
"step": 116
},
{
"epoch": 0.08803611738148984,
"grad_norm": 7.749171733856201,
"learning_rate": 4.447052631578947e-05,
"loss": 5.2865,
"step": 117
},
{
"epoch": 0.08878856282919488,
"grad_norm": 5.969038963317871,
"learning_rate": 4.393473684210526e-05,
"loss": 4.7642,
"step": 118
},
{
"epoch": 0.08954100827689992,
"grad_norm": 6.512506484985352,
"learning_rate": 4.339894736842105e-05,
"loss": 2.9121,
"step": 119
},
{
"epoch": 0.09029345372460497,
"grad_norm": 6.616455554962158,
"learning_rate": 4.2863157894736835e-05,
"loss": 5.1259,
"step": 120
},
{
"epoch": 0.09104589917231001,
"grad_norm": 8.678909301757812,
"learning_rate": 4.2327368421052634e-05,
"loss": 5.4049,
"step": 121
},
{
"epoch": 0.09179834462001504,
"grad_norm": 7.853146553039551,
"learning_rate": 4.179157894736842e-05,
"loss": 5.1167,
"step": 122
},
{
"epoch": 0.09255079006772009,
"grad_norm": 6.326202392578125,
"learning_rate": 4.1255789473684204e-05,
"loss": 5.1124,
"step": 123
},
{
"epoch": 0.09330323551542513,
"grad_norm": 6.513983726501465,
"learning_rate": 4.072e-05,
"loss": 4.8156,
"step": 124
},
{
"epoch": 0.09405568096313018,
"grad_norm": 7.685911178588867,
"learning_rate": 4.018421052631579e-05,
"loss": 4.3464,
"step": 125
},
{
"epoch": 0.09480812641083522,
"grad_norm": 8.669236183166504,
"learning_rate": 3.9648421052631573e-05,
"loss": 4.7814,
"step": 126
},
{
"epoch": 0.09556057185854025,
"grad_norm": 7.881282806396484,
"learning_rate": 3.9112631578947365e-05,
"loss": 5.0522,
"step": 127
},
{
"epoch": 0.0963130173062453,
"grad_norm": 6.99576473236084,
"learning_rate": 3.857684210526316e-05,
"loss": 4.4357,
"step": 128
},
{
"epoch": 0.09706546275395034,
"grad_norm": 8.76285171508789,
"learning_rate": 3.804105263157894e-05,
"loss": 5.5137,
"step": 129
},
{
"epoch": 0.09781790820165538,
"grad_norm": 7.629359245300293,
"learning_rate": 3.7505263157894734e-05,
"loss": 4.3788,
"step": 130
},
{
"epoch": 0.09857035364936043,
"grad_norm": 9.962780952453613,
"learning_rate": 3.6969473684210526e-05,
"loss": 4.5262,
"step": 131
},
{
"epoch": 0.09932279909706546,
"grad_norm": 7.68848180770874,
"learning_rate": 3.643368421052631e-05,
"loss": 4.803,
"step": 132
},
{
"epoch": 0.1000752445447705,
"grad_norm": 12.457582473754883,
"learning_rate": 3.5897894736842103e-05,
"loss": 5.286,
"step": 133
},
{
"epoch": 0.10082768999247554,
"grad_norm": 10.64576244354248,
"learning_rate": 3.5362105263157895e-05,
"loss": 5.5489,
"step": 134
},
{
"epoch": 0.10158013544018059,
"grad_norm": 8.363515853881836,
"learning_rate": 3.482631578947368e-05,
"loss": 4.724,
"step": 135
},
{
"epoch": 0.10233258088788563,
"grad_norm": 9.08327579498291,
"learning_rate": 3.429052631578947e-05,
"loss": 4.5964,
"step": 136
},
{
"epoch": 0.10308502633559068,
"grad_norm": 10.975964546203613,
"learning_rate": 3.375473684210526e-05,
"loss": 4.7091,
"step": 137
},
{
"epoch": 0.1038374717832957,
"grad_norm": 8.273202896118164,
"learning_rate": 3.321894736842105e-05,
"loss": 3.8967,
"step": 138
},
{
"epoch": 0.10458991723100075,
"grad_norm": 10.234407424926758,
"learning_rate": 3.268315789473684e-05,
"loss": 5.5976,
"step": 139
},
{
"epoch": 0.1053423626787058,
"grad_norm": 8.687202453613281,
"learning_rate": 3.2147368421052627e-05,
"loss": 4.7047,
"step": 140
},
{
"epoch": 0.10609480812641084,
"grad_norm": 9.25235652923584,
"learning_rate": 3.161157894736842e-05,
"loss": 4.7048,
"step": 141
},
{
"epoch": 0.10684725357411588,
"grad_norm": 10.904390335083008,
"learning_rate": 3.107578947368421e-05,
"loss": 5.0487,
"step": 142
},
{
"epoch": 0.10759969902182091,
"grad_norm": 12.776407241821289,
"learning_rate": 3.0539999999999996e-05,
"loss": 5.6051,
"step": 143
},
{
"epoch": 0.10835214446952596,
"grad_norm": 10.124897003173828,
"learning_rate": 3.0004210526315784e-05,
"loss": 5.6051,
"step": 144
},
{
"epoch": 0.109104589917231,
"grad_norm": 10.322992324829102,
"learning_rate": 2.946842105263158e-05,
"loss": 4.5447,
"step": 145
},
{
"epoch": 0.10985703536493605,
"grad_norm": 17.68702507019043,
"learning_rate": 2.8932631578947368e-05,
"loss": 5.1524,
"step": 146
},
{
"epoch": 0.11060948081264109,
"grad_norm": 14.997350692749023,
"learning_rate": 2.8396842105263153e-05,
"loss": 5.7192,
"step": 147
},
{
"epoch": 0.11136192626034612,
"grad_norm": 12.031723022460938,
"learning_rate": 2.786105263157895e-05,
"loss": 5.2456,
"step": 148
},
{
"epoch": 0.11211437170805116,
"grad_norm": 17.6466007232666,
"learning_rate": 2.7325263157894737e-05,
"loss": 6.2655,
"step": 149
},
{
"epoch": 0.11286681715575621,
"grad_norm": 18.988000869750977,
"learning_rate": 2.6789473684210522e-05,
"loss": 7.0167,
"step": 150
},
{
"epoch": 0.11286681715575621,
"eval_loss": 2.4503767490386963,
"eval_runtime": 37.6662,
"eval_samples_per_second": 14.867,
"eval_steps_per_second": 3.717,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.437792434153062e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}