{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.10614772224679346,
  "eval_steps": 500,
  "global_step": 60,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0017691287041132243,
      "grad_norm": 1.9742480441813004,
      "learning_rate": 2e-08,
      "loss": 3.2598,
      "step": 1
    },
    {
      "epoch": 0.0035382574082264487,
      "grad_norm": 2.038016027037428,
      "learning_rate": 4e-08,
      "loss": 3.4531,
      "step": 2
    },
    {
      "epoch": 0.005307386112339673,
      "grad_norm": 2.225136431627388,
      "learning_rate": 6e-08,
      "loss": 3.4746,
      "step": 3
    },
    {
      "epoch": 0.007076514816452897,
      "grad_norm": 2.0178713550531033,
      "learning_rate": 8e-08,
      "loss": 3.293,
      "step": 4
    },
    {
      "epoch": 0.008845643520566122,
      "grad_norm": 2.042908378033956,
      "learning_rate": 1e-07,
      "loss": 3.2383,
      "step": 5
    },
    {
      "epoch": 0.010614772224679346,
      "grad_norm": 2.1329682870055464,
      "learning_rate": 1.2e-07,
      "loss": 3.2441,
      "step": 6
    },
    {
      "epoch": 0.01238390092879257,
      "grad_norm": 2.1377356269998873,
      "learning_rate": 1.4e-07,
      "loss": 3.293,
      "step": 7
    },
    {
      "epoch": 0.014153029632905795,
      "grad_norm": 2.0514676704041146,
      "learning_rate": 1.6e-07,
      "loss": 3.4004,
      "step": 8
    },
    {
      "epoch": 0.01592215833701902,
      "grad_norm": 1.94775665931649,
      "learning_rate": 1.8e-07,
      "loss": 3.4355,
      "step": 9
    },
    {
      "epoch": 0.017691287041132243,
      "grad_norm": 2.0977166835164653,
      "learning_rate": 2e-07,
      "loss": 3.3711,
      "step": 10
    },
    {
      "epoch": 0.019460415745245468,
      "grad_norm": 2.192564032711453,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 3.3398,
      "step": 11
    },
    {
      "epoch": 0.021229544449358692,
      "grad_norm": 2.1498795965856914,
      "learning_rate": 2.4e-07,
      "loss": 3.1562,
      "step": 12
    },
    {
      "epoch": 0.022998673153471916,
      "grad_norm": 2.109291509736264,
      "learning_rate": 2.6e-07,
      "loss": 3.3652,
      "step": 13
    },
    {
      "epoch": 0.02476780185758514,
      "grad_norm": 2.044792800372603,
      "learning_rate": 2.8e-07,
      "loss": 3.2461,
      "step": 14
    },
    {
      "epoch": 0.026536930561698365,
      "grad_norm": 2.167283994785129,
      "learning_rate": 3e-07,
      "loss": 3.3301,
      "step": 15
    },
    {
      "epoch": 0.02830605926581159,
      "grad_norm": 1.995664320722997,
      "learning_rate": 3.2e-07,
      "loss": 3.3867,
      "step": 16
    },
    {
      "epoch": 0.03007518796992481,
      "grad_norm": 2.00843521276344,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 3.2363,
      "step": 17
    },
    {
      "epoch": 0.03184431667403804,
      "grad_norm": 2.107294400286055,
      "learning_rate": 3.6e-07,
      "loss": 3.3809,
      "step": 18
    },
    {
      "epoch": 0.03361344537815126,
      "grad_norm": 1.9965688324131208,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 3.2637,
      "step": 19
    },
    {
      "epoch": 0.03538257408226449,
      "grad_norm": 2.1690567393421936,
      "learning_rate": 4e-07,
      "loss": 3.2461,
      "step": 20
    },
    {
      "epoch": 0.03715170278637771,
      "grad_norm": 1.9509465820725813,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 3.3359,
      "step": 21
    },
    {
      "epoch": 0.038920831490490936,
      "grad_norm": 2.180359699431997,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 3.498,
      "step": 22
    },
    {
      "epoch": 0.040689960194604156,
      "grad_norm": 1.9303585281557267,
      "learning_rate": 4.6e-07,
      "loss": 3.4453,
      "step": 23
    },
    {
      "epoch": 0.042459088898717384,
      "grad_norm": 2.105287899781242,
      "learning_rate": 4.8e-07,
      "loss": 3.4531,
      "step": 24
    },
    {
      "epoch": 0.044228217602830605,
      "grad_norm": 1.996519659869237,
      "learning_rate": 5e-07,
      "loss": 3.3633,
      "step": 25
    },
    {
      "epoch": 0.04599734630694383,
      "grad_norm": 2.0672497293218903,
      "learning_rate": 5.2e-07,
      "loss": 3.4746,
      "step": 26
    },
    {
      "epoch": 0.047766475011057054,
      "grad_norm": 2.0187116926490165,
      "learning_rate": 5.4e-07,
      "loss": 3.3105,
      "step": 27
    },
    {
      "epoch": 0.04953560371517028,
      "grad_norm": 2.185446736666104,
      "learning_rate": 5.6e-07,
      "loss": 3.1953,
      "step": 28
    },
    {
      "epoch": 0.0513047324192835,
      "grad_norm": 1.9785091042817515,
      "learning_rate": 5.8e-07,
      "loss": 3.2207,
      "step": 29
    },
    {
      "epoch": 0.05307386112339673,
      "grad_norm": 1.983411961208081,
      "learning_rate": 6e-07,
      "loss": 3.1953,
      "step": 30
    },
    {
      "epoch": 0.05484298982750995,
      "grad_norm": 1.8887794910668352,
      "learning_rate": 6.2e-07,
      "loss": 3.127,
      "step": 31
    },
    {
      "epoch": 0.05661211853162318,
      "grad_norm": 2.024592500623624,
      "learning_rate": 6.4e-07,
      "loss": 3.3652,
      "step": 32
    },
    {
      "epoch": 0.0583812472357364,
      "grad_norm": 2.033056092327317,
      "learning_rate": 6.6e-07,
      "loss": 3.4629,
      "step": 33
    },
    {
      "epoch": 0.06015037593984962,
      "grad_norm": 2.1137985890313646,
      "learning_rate": 6.800000000000001e-07,
      "loss": 3.4277,
      "step": 34
    },
    {
      "epoch": 0.06191950464396285,
      "grad_norm": 2.135970317417631,
      "learning_rate": 7e-07,
      "loss": 3.5664,
      "step": 35
    },
    {
      "epoch": 0.06368863334807608,
      "grad_norm": 1.9525141602052385,
      "learning_rate": 7.2e-07,
      "loss": 3.3164,
      "step": 36
    },
    {
      "epoch": 0.0654577620521893,
      "grad_norm": 1.9679140574444143,
      "learning_rate": 7.4e-07,
      "loss": 3.1348,
      "step": 37
    },
    {
      "epoch": 0.06722689075630252,
      "grad_norm": 2.0697308820659295,
      "learning_rate": 7.599999999999999e-07,
      "loss": 3.3574,
      "step": 38
    },
    {
      "epoch": 0.06899601946041574,
      "grad_norm": 2.0879787228782463,
      "learning_rate": 7.799999999999999e-07,
      "loss": 3.3477,
      "step": 39
    },
    {
      "epoch": 0.07076514816452897,
      "grad_norm": 2.0051097367804234,
      "learning_rate": 8e-07,
      "loss": 3.3477,
      "step": 40
    },
    {
      "epoch": 0.0725342768686422,
      "grad_norm": 2.039846964044792,
      "learning_rate": 8.199999999999999e-07,
      "loss": 3.2109,
      "step": 41
    },
    {
      "epoch": 0.07430340557275542,
      "grad_norm": 2.13344976323939,
      "learning_rate": 8.399999999999999e-07,
      "loss": 3.3848,
      "step": 42
    },
    {
      "epoch": 0.07607253427686864,
      "grad_norm": 2.2830511277961585,
      "learning_rate": 8.599999999999999e-07,
      "loss": 3.1602,
      "step": 43
    },
    {
      "epoch": 0.07784166298098187,
      "grad_norm": 1.9427108734819927,
      "learning_rate": 8.799999999999999e-07,
      "loss": 3.1914,
      "step": 44
    },
    {
      "epoch": 0.07961079168509509,
      "grad_norm": 1.9926391710215448,
      "learning_rate": 9e-07,
      "loss": 3.3711,
      "step": 45
    },
    {
      "epoch": 0.08137992038920831,
      "grad_norm": 2.2237278323731107,
      "learning_rate": 9.2e-07,
      "loss": 3.4746,
      "step": 46
    },
    {
      "epoch": 0.08314904909332153,
      "grad_norm": 2.123759872019136,
      "learning_rate": 9.399999999999999e-07,
      "loss": 3.3926,
      "step": 47
    },
    {
      "epoch": 0.08491817779743477,
      "grad_norm": 2.138037893897646,
      "learning_rate": 9.6e-07,
      "loss": 3.377,
      "step": 48
    },
    {
      "epoch": 0.08668730650154799,
      "grad_norm": 2.074234748374453,
      "learning_rate": 9.8e-07,
      "loss": 3.3457,
      "step": 49
    },
    {
      "epoch": 0.08845643520566121,
      "grad_norm": 2.162562414477262,
      "learning_rate": 1e-06,
      "loss": 3.2148,
      "step": 50
    },
    {
      "epoch": 0.09022556390977443,
      "grad_norm": 2.091661753228539,
      "learning_rate": 1.02e-06,
      "loss": 3.3613,
      "step": 51
    },
    {
      "epoch": 0.09199469261388767,
      "grad_norm": 2.1815638012188963,
      "learning_rate": 1.04e-06,
      "loss": 3.2949,
      "step": 52
    },
    {
      "epoch": 0.09376382131800089,
      "grad_norm": 2.127146363547092,
      "learning_rate": 1.06e-06,
      "loss": 3.4297,
      "step": 53
    },
    {
      "epoch": 0.09553295002211411,
      "grad_norm": 2.201243987413546,
      "learning_rate": 1.08e-06,
      "loss": 3.1445,
      "step": 54
    },
    {
      "epoch": 0.09730207872622733,
      "grad_norm": 2.1006629919292075,
      "learning_rate": 1.1e-06,
      "loss": 3.4238,
      "step": 55
    },
    {
      "epoch": 0.09907120743034056,
      "grad_norm": 2.2056247586234115,
      "learning_rate": 1.12e-06,
      "loss": 3.3516,
      "step": 56
    },
    {
      "epoch": 0.10084033613445378,
      "grad_norm": 2.300409874538962,
      "learning_rate": 1.1399999999999999e-06,
      "loss": 3.4766,
      "step": 57
    },
    {
      "epoch": 0.102609464838567,
      "grad_norm": 2.10637739408632,
      "learning_rate": 1.16e-06,
      "loss": 3.3477,
      "step": 58
    },
    {
      "epoch": 0.10437859354268023,
      "grad_norm": 2.047319466977983,
      "learning_rate": 1.18e-06,
      "loss": 3.3965,
      "step": 59
    },
    {
      "epoch": 0.10614772224679346,
      "grad_norm": 2.0670306109309102,
      "learning_rate": 1.2e-06,
      "loss": 3.2266,
      "step": 60
    }
  ],
  "logging_steps": 1,
  "max_steps": 565,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 5,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 15477887729664.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}