{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8,
  "eval_steps": 500,
  "global_step": 50000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016,
      "grad_norm": 35.146240234375,
      "learning_rate": 4.9004900490049e-05,
      "loss": 10.1867,
      "mean_token_accuracy": 0.544253178048879,
      "step": 1000
    },
    {
      "epoch": 0.032,
      "grad_norm": 32.855838775634766,
      "learning_rate": 4.8004800480048006e-05,
      "loss": 10.2308,
      "mean_token_accuracy": 0.5428702699318528,
      "step": 2000
    },
    {
      "epoch": 0.048,
      "grad_norm": 36.6450309753418,
      "learning_rate": 4.700470047004701e-05,
      "loss": 10.2215,
      "mean_token_accuracy": 0.5424384255111218,
      "step": 3000
    },
    {
      "epoch": 0.064,
      "grad_norm": 33.15324783325195,
      "learning_rate": 4.6004600460046006e-05,
      "loss": 10.2813,
      "mean_token_accuracy": 0.5411355452165008,
      "step": 4000
    },
    {
      "epoch": 0.08,
      "grad_norm": 31.203840255737305,
      "learning_rate": 4.500450045004501e-05,
      "loss": 10.2447,
      "mean_token_accuracy": 0.5425998609103262,
      "step": 5000
    },
    {
      "epoch": 0.096,
      "grad_norm": 29.603290557861328,
      "learning_rate": 4.4004400440044006e-05,
      "loss": 10.131,
      "mean_token_accuracy": 0.5463594534434378,
      "step": 6000
    },
    {
      "epoch": 0.112,
      "grad_norm": 31.61845588684082,
      "learning_rate": 4.3004300430043e-05,
      "loss": 10.0479,
      "mean_token_accuracy": 0.5483995637446641,
      "step": 7000
    },
    {
      "epoch": 0.128,
      "grad_norm": 31.28079605102539,
      "learning_rate": 4.2004200420042006e-05,
      "loss": 9.9802,
      "mean_token_accuracy": 0.5492958616241813,
      "step": 8000
    },
    {
      "epoch": 0.144,
      "grad_norm": 28.147220611572266,
      "learning_rate": 4.100410041004101e-05,
      "loss": 9.8445,
      "mean_token_accuracy": 0.5530450477898121,
      "step": 9000
    },
    {
      "epoch": 0.16,
      "grad_norm": 26.830713272094727,
      "learning_rate": 4.0004000400040005e-05,
      "loss": 9.8066,
      "mean_token_accuracy": 0.5538738285191357,
      "step": 10000
    },
    {
      "epoch": 0.176,
      "grad_norm": 28.744468688964844,
      "learning_rate": 3.900390039003901e-05,
      "loss": 9.7206,
      "mean_token_accuracy": 0.5572981600053608,
      "step": 11000
    },
    {
      "epoch": 0.192,
      "grad_norm": 25.372802734375,
      "learning_rate": 3.8003800380038005e-05,
      "loss": 9.6634,
      "mean_token_accuracy": 0.5570068260915577,
      "step": 12000
    },
    {
      "epoch": 0.208,
      "grad_norm": 31.25323486328125,
      "learning_rate": 3.7003700370037e-05,
      "loss": 9.5426,
      "mean_token_accuracy": 0.5603208757042885,
      "step": 13000
    },
    {
      "epoch": 0.224,
      "grad_norm": 33.54015350341797,
      "learning_rate": 3.6003600360036005e-05,
      "loss": 9.532,
      "mean_token_accuracy": 0.561353420805186,
      "step": 14000
    },
    {
      "epoch": 0.24,
      "grad_norm": 30.744821548461914,
      "learning_rate": 3.500350035003501e-05,
      "loss": 9.4778,
      "mean_token_accuracy": 0.5629392731450498,
      "step": 15000
    },
    {
      "epoch": 0.256,
      "grad_norm": 24.013673782348633,
      "learning_rate": 3.4003400340034005e-05,
      "loss": 9.477,
      "mean_token_accuracy": 0.5625265723504126,
      "step": 16000
    },
    {
      "epoch": 0.272,
      "grad_norm": 27.767776489257812,
      "learning_rate": 3.300330033003301e-05,
      "loss": 9.3889,
      "mean_token_accuracy": 0.5659195666387677,
      "step": 17000
    },
    {
      "epoch": 0.288,
      "grad_norm": 29.1698055267334,
      "learning_rate": 3.2003200320032004e-05,
      "loss": 9.3476,
      "mean_token_accuracy": 0.5666751223653554,
      "step": 18000
    },
    {
      "epoch": 0.304,
      "grad_norm": 29.41615867614746,
      "learning_rate": 3.1003100310031e-05,
      "loss": 9.3244,
      "mean_token_accuracy": 0.5674368364065886,
      "step": 19000
    },
    {
      "epoch": 0.32,
      "grad_norm": 23.839937210083008,
      "learning_rate": 3.0003000300030004e-05,
      "loss": 9.2717,
      "mean_token_accuracy": 0.5696731022559106,
      "step": 20000
    },
    {
      "epoch": 0.336,
      "grad_norm": 28.645061492919922,
      "learning_rate": 2.9002900290029007e-05,
      "loss": 9.2327,
      "mean_token_accuracy": 0.5706070831567049,
      "step": 21000
    },
    {
      "epoch": 0.352,
      "grad_norm": 26.104412078857422,
      "learning_rate": 2.8002800280028004e-05,
      "loss": 9.223,
      "mean_token_accuracy": 0.5696454518660903,
      "step": 22000
    },
    {
      "epoch": 0.368,
      "grad_norm": 28.753164291381836,
      "learning_rate": 2.7002700270027004e-05,
      "loss": 9.145,
      "mean_token_accuracy": 0.5734736853465437,
      "step": 23000
    },
    {
      "epoch": 0.384,
      "grad_norm": 21.9370174407959,
      "learning_rate": 2.6002600260026007e-05,
      "loss": 9.1538,
      "mean_token_accuracy": 0.5731835125163197,
      "step": 24000
    },
    {
      "epoch": 0.4,
      "grad_norm": 31.202007293701172,
      "learning_rate": 2.5002500250025003e-05,
      "loss": 9.1016,
      "mean_token_accuracy": 0.5749420530423522,
      "step": 25000
    },
    {
      "epoch": 0.416,
      "grad_norm": 24.899829864501953,
      "learning_rate": 2.4002400240024003e-05,
      "loss": 9.1086,
      "mean_token_accuracy": 0.5731981860995292,
      "step": 26000
    },
    {
      "epoch": 0.432,
      "grad_norm": 26.59105682373047,
      "learning_rate": 2.3002300230023003e-05,
      "loss": 9.0585,
      "mean_token_accuracy": 0.575087952144444,
      "step": 27000
    },
    {
      "epoch": 0.448,
      "grad_norm": 27.35274314880371,
      "learning_rate": 2.2002200220022003e-05,
      "loss": 9.0378,
      "mean_token_accuracy": 0.5757604394182563,
      "step": 28000
    },
    {
      "epoch": 0.464,
      "grad_norm": 23.581249237060547,
      "learning_rate": 2.1002100210021003e-05,
      "loss": 9.087,
      "mean_token_accuracy": 0.5731492869332433,
      "step": 29000
    },
    {
      "epoch": 0.48,
      "grad_norm": 26.905712127685547,
      "learning_rate": 2.0002000200020003e-05,
      "loss": 9.0834,
      "mean_token_accuracy": 0.5751373803690076,
      "step": 30000
    },
    {
      "epoch": 0.496,
      "grad_norm": 24.928512573242188,
      "learning_rate": 1.9001900190019003e-05,
      "loss": 9.0077,
      "mean_token_accuracy": 0.5775581553503871,
      "step": 31000
    },
    {
      "epoch": 0.512,
      "grad_norm": 28.373720169067383,
      "learning_rate": 1.8001800180018002e-05,
      "loss": 9.0328,
      "mean_token_accuracy": 0.575654436133802,
      "step": 32000
    },
    {
      "epoch": 0.528,
      "grad_norm": 26.213802337646484,
      "learning_rate": 1.7001700170017002e-05,
      "loss": 8.9223,
      "mean_token_accuracy": 0.5791815776266158,
      "step": 33000
    },
    {
      "epoch": 0.544,
      "grad_norm": 27.070953369140625,
      "learning_rate": 1.6001600160016002e-05,
      "loss": 8.9483,
      "mean_token_accuracy": 0.5792850709185005,
      "step": 34000
    },
    {
      "epoch": 0.56,
      "grad_norm": 22.90890884399414,
      "learning_rate": 1.5001500150015002e-05,
      "loss": 9.0419,
      "mean_token_accuracy": 0.5748572928607464,
      "step": 35000
    },
    {
      "epoch": 0.576,
      "grad_norm": 28.693235397338867,
      "learning_rate": 1.4001400140014002e-05,
      "loss": 8.951,
      "mean_token_accuracy": 0.5788668767511844,
      "step": 36000
    },
    {
      "epoch": 0.592,
      "grad_norm": 27.749176025390625,
      "learning_rate": 1.3001300130013003e-05,
      "loss": 8.9335,
      "mean_token_accuracy": 0.5779373695105314,
      "step": 37000
    },
    {
      "epoch": 0.608,
      "grad_norm": 25.057411193847656,
      "learning_rate": 1.2001200120012002e-05,
      "loss": 8.8612,
      "mean_token_accuracy": 0.5811370112374425,
      "step": 38000
    },
    {
      "epoch": 0.624,
      "grad_norm": 26.132497787475586,
      "learning_rate": 1.1001100110011001e-05,
      "loss": 8.8688,
      "mean_token_accuracy": 0.5816743801310659,
      "step": 39000
    },
    {
      "epoch": 0.64,
      "grad_norm": 26.350906372070312,
      "learning_rate": 1.0001000100010001e-05,
      "loss": 8.8358,
      "mean_token_accuracy": 0.5821756240203977,
      "step": 40000
    },
    {
      "epoch": 0.656,
      "grad_norm": 24.874052047729492,
      "learning_rate": 9.000900090009001e-06,
      "loss": 8.8207,
      "mean_token_accuracy": 0.5826298766359687,
      "step": 41000
    },
    {
      "epoch": 0.672,
      "grad_norm": 26.102046966552734,
      "learning_rate": 8.000800080008001e-06,
      "loss": 8.8275,
      "mean_token_accuracy": 0.5821652906313538,
      "step": 42000
    },
    {
      "epoch": 0.688,
      "grad_norm": 29.679323196411133,
      "learning_rate": 7.000700070007001e-06,
      "loss": 8.85,
      "mean_token_accuracy": 0.5809146241471171,
      "step": 43000
    },
    {
      "epoch": 0.704,
      "grad_norm": 26.106046676635742,
      "learning_rate": 6.000600060006001e-06,
      "loss": 8.8531,
      "mean_token_accuracy": 0.5821815392710269,
      "step": 44000
    },
    {
      "epoch": 0.72,
      "grad_norm": 22.304044723510742,
      "learning_rate": 5.000500050005001e-06,
      "loss": 8.8142,
      "mean_token_accuracy": 0.5825774453170598,
      "step": 45000
    },
    {
      "epoch": 0.736,
      "grad_norm": 28.982166290283203,
      "learning_rate": 4.0004000400040005e-06,
      "loss": 8.8434,
      "mean_token_accuracy": 0.5813510757684708,
      "step": 46000
    },
    {
      "epoch": 0.752,
      "grad_norm": 27.076814651489258,
      "learning_rate": 3.0003000300030004e-06,
      "loss": 8.7994,
      "mean_token_accuracy": 0.5838636282868683,
      "step": 47000
    },
    {
      "epoch": 0.768,
      "grad_norm": 26.112808227539062,
      "learning_rate": 2.0002000200020003e-06,
      "loss": 8.8055,
      "mean_token_accuracy": 0.5828906665407121,
      "step": 48000
    },
    {
      "epoch": 0.784,
      "grad_norm": 24.94652557373047,
      "learning_rate": 1.0001000100010001e-06,
      "loss": 8.8295,
      "mean_token_accuracy": 0.5821196795813739,
      "step": 49000
    },
    {
      "epoch": 0.8,
      "grad_norm": 28.15529441833496,
      "learning_rate": 0.0,
      "loss": 8.7714,
      "mean_token_accuracy": 0.5843258857652545,
      "step": 50000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 50000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.68231960576e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}