{
  "best_metric": 0.466439425945282,
  "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/drone/drone-DinoVdeau-from-binary-large-2024_11_14-batch-size16_freeze_probs/checkpoint-22776",
  "epoch": 62.0,
  "eval_steps": 500,
  "global_step": 27156,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "eval_explained_variance": 0.3631434440612793,
      "eval_kl_divergence": 0.421912282705307,
      "eval_loss": 0.4821413457393646,
      "eval_mae": 0.13084472715854645,
      "eval_rmse": 0.173090398311615,
      "eval_runtime": 64.2475,
      "eval_samples_per_second": 36.64,
      "eval_steps_per_second": 2.304,
      "learning_rate": 0.001,
      "step": 438
    },
    {
      "epoch": 1.1415525114155252,
      "grad_norm": 0.4757365882396698,
      "learning_rate": 0.001,
      "loss": 0.5317,
      "step": 500
    },
    {
      "epoch": 2.0,
      "eval_explained_variance": 0.3752269744873047,
      "eval_kl_divergence": 0.6148446202278137,
      "eval_loss": 0.4784533977508545,
      "eval_mae": 0.12629373371601105,
      "eval_rmse": 0.17098082602024078,
      "eval_runtime": 63.6833,
      "eval_samples_per_second": 36.964,
      "eval_steps_per_second": 2.324,
      "learning_rate": 0.001,
      "step": 876
    },
    {
      "epoch": 2.2831050228310503,
      "grad_norm": 0.4254082143306732,
      "learning_rate": 0.001,
      "loss": 0.4832,
      "step": 1000
    },
    {
      "epoch": 3.0,
      "eval_explained_variance": 0.3838556706905365,
      "eval_kl_divergence": 0.48802560567855835,
      "eval_loss": 0.47776785492897034,
      "eval_mae": 0.12731628119945526,
      "eval_rmse": 0.16985835134983063,
      "eval_runtime": 62.637,
      "eval_samples_per_second": 37.582,
      "eval_steps_per_second": 2.363,
      "learning_rate": 0.001,
      "step": 1314
    },
    {
      "epoch": 3.4246575342465753,
      "grad_norm": 0.2670271098613739,
      "learning_rate": 0.001,
      "loss": 0.4791,
      "step": 1500
    },
    {
      "epoch": 4.0,
      "eval_explained_variance": 0.38376739621162415,
      "eval_kl_divergence": 0.3418101370334625,
      "eval_loss": 0.4793245792388916,
      "eval_mae": 0.12901858985424042,
      "eval_rmse": 0.171015664935112,
      "eval_runtime": 62.2828,
      "eval_samples_per_second": 37.795,
      "eval_steps_per_second": 2.376,
      "learning_rate": 0.001,
      "step": 1752
    },
    {
      "epoch": 4.566210045662101,
      "grad_norm": 0.20498104393482208,
      "learning_rate": 0.001,
      "loss": 0.4771,
      "step": 2000
    },
    {
      "epoch": 5.0,
      "eval_explained_variance": 0.40547776222229004,
      "eval_kl_divergence": 0.34562820196151733,
      "eval_loss": 0.47521594166755676,
      "eval_mae": 0.12799377739429474,
      "eval_rmse": 0.16736441850662231,
      "eval_runtime": 62.7606,
      "eval_samples_per_second": 37.508,
      "eval_steps_per_second": 2.358,
      "learning_rate": 0.001,
      "step": 2190
    },
    {
      "epoch": 5.707762557077626,
      "grad_norm": 0.24335043132305145,
      "learning_rate": 0.001,
      "loss": 0.4752,
      "step": 2500
    },
    {
      "epoch": 6.0,
      "eval_explained_variance": 0.3849389851093292,
      "eval_kl_divergence": 0.6402714848518372,
      "eval_loss": 0.478865385055542,
      "eval_mae": 0.12540282309055328,
      "eval_rmse": 0.17068879306316376,
      "eval_runtime": 63.4836,
      "eval_samples_per_second": 37.08,
      "eval_steps_per_second": 2.331,
      "learning_rate": 0.001,
      "step": 2628
    },
    {
      "epoch": 6.8493150684931505,
      "grad_norm": 0.18768365681171417,
      "learning_rate": 0.001,
      "loss": 0.4752,
      "step": 3000
    },
    {
      "epoch": 7.0,
      "eval_explained_variance": 0.3788411617279053,
      "eval_kl_divergence": 0.5491646528244019,
      "eval_loss": 0.4779475927352905,
      "eval_mae": 0.12878474593162537,
      "eval_rmse": 0.17091502249240875,
      "eval_runtime": 63.4904,
      "eval_samples_per_second": 37.076,
      "eval_steps_per_second": 2.331,
      "learning_rate": 0.001,
      "step": 3066
    },
    {
      "epoch": 7.9908675799086755,
      "grad_norm": 0.1587909311056137,
      "learning_rate": 0.001,
      "loss": 0.4735,
      "step": 3500
    },
    {
      "epoch": 8.0,
      "eval_explained_variance": 0.40857037901878357,
      "eval_kl_divergence": 0.33827269077301025,
      "eval_loss": 0.4756968021392822,
      "eval_mae": 0.12695902585983276,
      "eval_rmse": 0.16784566640853882,
      "eval_runtime": 64.8792,
      "eval_samples_per_second": 36.283,
      "eval_steps_per_second": 2.281,
      "learning_rate": 0.001,
      "step": 3504
    },
    {
      "epoch": 9.0,
      "eval_explained_variance": 0.4111548960208893,
      "eval_kl_divergence": 0.5417521595954895,
      "eval_loss": 0.4731782376766205,
      "eval_mae": 0.12311580032110214,
      "eval_rmse": 0.1657222956418991,
      "eval_runtime": 61.0673,
      "eval_samples_per_second": 38.548,
      "eval_steps_per_second": 2.424,
      "learning_rate": 0.001,
      "step": 3942
    },
    {
      "epoch": 9.132420091324201,
      "grad_norm": 0.1892658919095993,
      "learning_rate": 0.001,
      "loss": 0.4719,
      "step": 4000
    },
    {
      "epoch": 10.0,
      "eval_explained_variance": 0.4006313979625702,
      "eval_kl_divergence": 0.15472176671028137,
      "eval_loss": 0.4799855649471283,
      "eval_mae": 0.1320570707321167,
      "eval_rmse": 0.1722680777311325,
      "eval_runtime": 62.7974,
      "eval_samples_per_second": 37.486,
      "eval_steps_per_second": 2.357,
      "learning_rate": 0.001,
      "step": 4380
    },
    {
      "epoch": 10.273972602739725,
      "grad_norm": 0.20271532237529755,
      "learning_rate": 0.001,
      "loss": 0.4727,
      "step": 4500
    },
    {
      "epoch": 11.0,
      "eval_explained_variance": 0.41580215096473694,
      "eval_kl_divergence": 0.3436921238899231,
      "eval_loss": 0.4731641411781311,
      "eval_mae": 0.12562014162540436,
      "eval_rmse": 0.16564464569091797,
      "eval_runtime": 62.757,
      "eval_samples_per_second": 37.51,
      "eval_steps_per_second": 2.358,
      "learning_rate": 0.001,
      "step": 4818
    },
    {
      "epoch": 11.415525114155251,
      "grad_norm": 0.15571434795856476,
      "learning_rate": 0.001,
      "loss": 0.4723,
      "step": 5000
    },
    {
      "epoch": 12.0,
      "eval_explained_variance": 0.401115745306015,
      "eval_kl_divergence": 0.2946830093860626,
      "eval_loss": 0.47767141461372375,
      "eval_mae": 0.12927968800067902,
      "eval_rmse": 0.17007046937942505,
      "eval_runtime": 62.3684,
      "eval_samples_per_second": 37.743,
      "eval_steps_per_second": 2.373,
      "learning_rate": 0.001,
      "step": 5256
    },
    {
      "epoch": 12.557077625570777,
      "grad_norm": 0.16043365001678467,
      "learning_rate": 0.001,
      "loss": 0.4735,
      "step": 5500
    },
    {
      "epoch": 13.0,
      "eval_explained_variance": 0.39704158902168274,
      "eval_kl_divergence": 0.6136478781700134,
      "eval_loss": 0.48009705543518066,
      "eval_mae": 0.12475714087486267,
      "eval_rmse": 0.16773907840251923,
      "eval_runtime": 64.7917,
      "eval_samples_per_second": 36.332,
      "eval_steps_per_second": 2.284,
      "learning_rate": 0.001,
      "step": 5694
    },
    {
      "epoch": 13.698630136986301,
      "grad_norm": 0.13616104423999786,
      "learning_rate": 0.001,
      "loss": 0.4728,
      "step": 6000
    },
    {
      "epoch": 14.0,
      "eval_explained_variance": 0.40355002880096436,
      "eval_kl_divergence": Infinity,
      "eval_loss": 0.4954195022583008,
      "eval_mae": 0.12534154951572418,
      "eval_rmse": 0.16692323982715607,
      "eval_runtime": 62.1801,
      "eval_samples_per_second": 37.858,
      "eval_steps_per_second": 2.38,
      "learning_rate": 0.001,
      "step": 6132
    },
    {
      "epoch": 14.840182648401827,
      "grad_norm": 0.12133222818374634,
      "learning_rate": 0.001,
      "loss": 0.4713,
      "step": 6500
    },
    {
      "epoch": 15.0,
      "eval_explained_variance": 0.4051372706890106,
      "eval_kl_divergence": Infinity,
      "eval_loss": 0.4812238812446594,
      "eval_mae": 0.12540575861930847,
      "eval_rmse": 0.16624794900417328,
      "eval_runtime": 61.3206,
      "eval_samples_per_second": 38.388,
      "eval_steps_per_second": 2.414,
      "learning_rate": 0.001,
      "step": 6570
    },
    {
      "epoch": 15.981735159817351,
      "grad_norm": 0.11760278791189194,
      "learning_rate": 0.0001,
      "loss": 0.4706,
      "step": 7000
    },
    {
      "epoch": 16.0,
      "eval_explained_variance": 0.41243478655815125,
      "eval_kl_divergence": Infinity,
      "eval_loss": 0.4858487546443939,
      "eval_mae": 0.12432911992073059,
      "eval_rmse": 0.16562338173389435,
      "eval_runtime": 61.1501,
      "eval_samples_per_second": 38.495,
      "eval_steps_per_second": 2.42,
      "learning_rate": 0.0001,
      "step": 7008
    },
    {
      "epoch": 17.0,
      "eval_explained_variance": 0.4291960895061493,
      "eval_kl_divergence": 0.41650328040122986,
      "eval_loss": 0.47084349393844604,
      "eval_mae": 0.12233477830886841,
      "eval_rmse": 0.162751242518425,
      "eval_runtime": 60.5736,
      "eval_samples_per_second": 38.862,
      "eval_steps_per_second": 2.443,
      "learning_rate": 0.0001,
      "step": 7446
    },
    {
      "epoch": 17.123287671232877,
      "grad_norm": 0.13284093141555786,
      "learning_rate": 0.0001,
      "loss": 0.4672,
      "step": 7500
    },
    {
      "epoch": 18.0,
      "eval_explained_variance": 0.43114474415779114,
      "eval_kl_divergence": 0.4066373407840729,
      "eval_loss": 0.4707622528076172,
      "eval_mae": 0.12164173275232315,
      "eval_rmse": 0.16261519491672516,
      "eval_runtime": 63.1248,
      "eval_samples_per_second": 37.291,
      "eval_steps_per_second": 2.345,
      "learning_rate": 0.0001,
      "step": 7884
    },
    {
      "epoch": 18.264840182648403,
      "grad_norm": 0.11859569698572159,
      "learning_rate": 0.0001,
      "loss": 0.4659,
      "step": 8000
    },
    {
      "epoch": 19.0,
      "eval_explained_variance": 0.4342735707759857,
      "eval_kl_divergence": 0.31854644417762756,
      "eval_loss": 0.47095733880996704,
      "eval_mae": 0.12272538989782333,
      "eval_rmse": 0.16323107481002808,
      "eval_runtime": 61.7089,
      "eval_samples_per_second": 38.147,
      "eval_steps_per_second": 2.398,
      "learning_rate": 0.0001,
      "step": 8322
    },
    {
      "epoch": 19.40639269406393,
      "grad_norm": 0.16951066255569458,
      "learning_rate": 0.0001,
      "loss": 0.4653,
      "step": 8500
    },
    {
      "epoch": 20.0,
      "eval_explained_variance": 0.43487218022346497,
      "eval_kl_divergence": 0.465139240026474,
      "eval_loss": 0.4696938395500183,
      "eval_mae": 0.12050192803144455,
      "eval_rmse": 0.1620241105556488,
      "eval_runtime": 61.0162,
      "eval_samples_per_second": 38.58,
      "eval_steps_per_second": 2.426,
      "learning_rate": 0.0001,
      "step": 8760
    },
    {
      "epoch": 20.54794520547945,
      "grad_norm": 0.167369082570076,
      "learning_rate": 0.0001,
      "loss": 0.4653,
      "step": 9000
    },
    {
      "epoch": 21.0,
      "eval_explained_variance": 0.4389828145503998,
      "eval_kl_divergence": 0.3772728741168976,
      "eval_loss": 0.46922874450683594,
      "eval_mae": 0.12155676633119583,
      "eval_rmse": 0.16139467060565948,
      "eval_runtime": 62.17,
      "eval_samples_per_second": 37.864,
      "eval_steps_per_second": 2.381,
      "learning_rate": 0.0001,
      "step": 9198
    },
    {
      "epoch": 21.689497716894977,
      "grad_norm": 0.1247042864561081,
      "learning_rate": 0.0001,
      "loss": 0.4659,
      "step": 9500
    },
    {
      "epoch": 22.0,
      "eval_explained_variance": 0.43975934386253357,
      "eval_kl_divergence": 0.4611187279224396,
      "eval_loss": 0.4685634672641754,
      "eval_mae": 0.1203194335103035,
      "eval_rmse": 0.16088876128196716,
      "eval_runtime": 62.0052,
      "eval_samples_per_second": 37.965,
      "eval_steps_per_second": 2.387,
      "learning_rate": 0.0001,
      "step": 9636
    },
    {
      "epoch": 22.831050228310502,
      "grad_norm": 0.16208066046237946,
      "learning_rate": 0.0001,
      "loss": 0.465,
      "step": 10000
    },
    {
      "epoch": 23.0,
      "eval_explained_variance": 0.44279029965400696,
      "eval_kl_divergence": 0.24986685812473297,
      "eval_loss": 0.47018975019454956,
      "eval_mae": 0.12256480008363724,
      "eval_rmse": 0.16208301484584808,
      "eval_runtime": 61.6543,
      "eval_samples_per_second": 38.181,
      "eval_steps_per_second": 2.4,
      "learning_rate": 0.0001,
      "step": 10074
    },
    {
      "epoch": 23.972602739726028,
      "grad_norm": 0.17417912185192108,
      "learning_rate": 0.0001,
      "loss": 0.4633,
      "step": 10500
    },
    {
      "epoch": 24.0,
      "eval_explained_variance": 0.4367590844631195,
      "eval_kl_divergence": 0.3702172040939331,
      "eval_loss": 0.4705464243888855,
      "eval_mae": 0.12131566554307938,
      "eval_rmse": 0.16277877986431122,
      "eval_runtime": 62.8273,
      "eval_samples_per_second": 37.468,
      "eval_steps_per_second": 2.356,
      "learning_rate": 0.0001,
      "step": 10512
    },
    {
      "epoch": 25.0,
      "eval_explained_variance": 0.4433206617832184,
      "eval_kl_divergence": 0.5132729411125183,
      "eval_loss": 0.4678299129009247,
      "eval_mae": 0.11875440925359726,
      "eval_rmse": 0.16013289988040924,
      "eval_runtime": 61.7077,
      "eval_samples_per_second": 38.148,
      "eval_steps_per_second": 2.398,
      "learning_rate": 0.0001,
      "step": 10950
    },
    {
      "epoch": 25.114155251141554,
      "grad_norm": 0.13617579638957977,
      "learning_rate": 0.0001,
      "loss": 0.4656,
      "step": 11000
    },
    {
      "epoch": 26.0,
      "eval_explained_variance": 0.4423791468143463,
      "eval_kl_divergence": 0.5665323138237,
      "eval_loss": 0.46802961826324463,
      "eval_mae": 0.117874376475811,
      "eval_rmse": 0.1604483276605606,
      "eval_runtime": 61.9639,
      "eval_samples_per_second": 37.99,
      "eval_steps_per_second": 2.388,
      "learning_rate": 0.0001,
      "step": 11388
    },
    {
      "epoch": 26.255707762557076,
      "grad_norm": 0.15818916261196136,
      "learning_rate": 0.0001,
      "loss": 0.4629,
      "step": 11500
    },
    {
      "epoch": 27.0,
      "eval_explained_variance": 0.4434410333633423,
      "eval_kl_divergence": 0.42424070835113525,
      "eval_loss": 0.4680938124656677,
      "eval_mae": 0.1199984923005104,
      "eval_rmse": 0.16038183867931366,
      "eval_runtime": 62.3144,
      "eval_samples_per_second": 37.776,
      "eval_steps_per_second": 2.375,
      "learning_rate": 0.0001,
      "step": 11826
    },
    {
      "epoch": 27.397260273972602,
      "grad_norm": 0.15971983969211578,
      "learning_rate": 0.0001,
      "loss": 0.4636,
      "step": 12000
    },
    {
      "epoch": 28.0,
      "eval_explained_variance": 0.44512465596199036,
      "eval_kl_divergence": 0.2967982292175293,
      "eval_loss": 0.4693257212638855,
      "eval_mae": 0.12149528414011002,
      "eval_rmse": 0.1616295725107193,
      "eval_runtime": 66.789,
      "eval_samples_per_second": 35.245,
      "eval_steps_per_second": 2.216,
      "learning_rate": 0.0001,
      "step": 12264
    },
    {
      "epoch": 28.538812785388128,
      "grad_norm": 0.15448875725269318,
      "learning_rate": 0.0001,
      "loss": 0.4633,
      "step": 12500
    },
    {
      "epoch": 29.0,
      "eval_explained_variance": 0.4442717730998993,
      "eval_kl_divergence": 0.3924856185913086,
      "eval_loss": 0.46847742795944214,
      "eval_mae": 0.1196620985865593,
      "eval_rmse": 0.16072382032871246,
      "eval_runtime": 61.9086,
      "eval_samples_per_second": 38.024,
      "eval_steps_per_second": 2.391,
      "learning_rate": 0.0001,
      "step": 12702
    },
    {
      "epoch": 29.680365296803654,
      "grad_norm": 0.15532433986663818,
      "learning_rate": 0.0001,
      "loss": 0.4631,
      "step": 13000
    },
    {
      "epoch": 30.0,
      "eval_explained_variance": 0.4473068416118622,
      "eval_kl_divergence": 0.2495478093624115,
      "eval_loss": 0.46944141387939453,
      "eval_mae": 0.12209376692771912,
      "eval_rmse": 0.16142255067825317,
      "eval_runtime": 62.4285,
      "eval_samples_per_second": 37.707,
      "eval_steps_per_second": 2.371,
      "learning_rate": 0.0001,
      "step": 13140
    },
    {
      "epoch": 30.82191780821918,
      "grad_norm": 0.1961052566766739,
      "learning_rate": 0.0001,
      "loss": 0.463,
      "step": 13500
    },
    {
      "epoch": 31.0,
      "eval_explained_variance": 0.4445982277393341,
      "eval_kl_divergence": 0.45099732279777527,
      "eval_loss": 0.4678958058357239,
      "eval_mae": 0.11854251474142075,
      "eval_rmse": 0.16011421382427216,
      "eval_runtime": 61.3729,
      "eval_samples_per_second": 38.356,
      "eval_steps_per_second": 2.411,
      "learning_rate": 0.0001,
      "step": 13578
    },
    {
      "epoch": 31.963470319634702,
      "grad_norm": 0.3346303701400757,
      "learning_rate": 1e-05,
      "loss": 0.4623,
      "step": 14000
    },
    {
      "epoch": 32.0,
      "eval_explained_variance": 0.4478188455104828,
      "eval_kl_divergence": 0.3885524570941925,
      "eval_loss": 0.46778997778892517,
      "eval_mae": 0.11933697015047073,
      "eval_rmse": 0.16006481647491455,
      "eval_runtime": 63.8544,
      "eval_samples_per_second": 36.865,
      "eval_steps_per_second": 2.318,
      "learning_rate": 1e-05,
      "step": 14016
    },
    {
      "epoch": 33.0,
      "eval_explained_variance": 0.44756200909614563,
      "eval_kl_divergence": 0.31322383880615234,
      "eval_loss": 0.4686955511569977,
      "eval_mae": 0.1201881393790245,
      "eval_rmse": 0.16055406630039215,
      "eval_runtime": 62.7334,
      "eval_samples_per_second": 37.524,
      "eval_steps_per_second": 2.359,
      "learning_rate": 1e-05,
      "step": 14454
    },
    {
      "epoch": 33.10502283105023,
      "grad_norm": 0.21087272465229034,
      "learning_rate": 1e-05,
      "loss": 0.4621,
      "step": 14500
    },
    {
      "epoch": 34.0,
      "eval_explained_variance": 0.4478868544101715,
      "eval_kl_divergence": 0.3957745432853699,
      "eval_loss": 0.46784329414367676,
      "eval_mae": 0.11951460689306259,
      "eval_rmse": 0.1600986272096634,
      "eval_runtime": 60.6174,
      "eval_samples_per_second": 38.834,
      "eval_steps_per_second": 2.442,
      "learning_rate": 1e-05,
      "step": 14892
    },
    {
      "epoch": 34.24657534246575,
      "grad_norm": 0.1875353455543518,
      "learning_rate": 1e-05,
      "loss": 0.4607,
      "step": 15000
    },
    {
      "epoch": 35.0,
      "eval_explained_variance": 0.44849491119384766,
      "eval_kl_divergence": 0.45786312222480774,
      "eval_loss": 0.4671097695827484,
      "eval_mae": 0.11800643056631088,
      "eval_rmse": 0.15947793424129486,
      "eval_runtime": 61.7609,
      "eval_samples_per_second": 38.115,
      "eval_steps_per_second": 2.396,
      "learning_rate": 1e-05,
      "step": 15330
    },
    {
      "epoch": 35.38812785388128,
      "grad_norm": 0.16752338409423828,
      "learning_rate": 1e-05,
      "loss": 0.4619,
      "step": 15500
    },
    {
      "epoch": 36.0,
      "eval_explained_variance": 0.44863569736480713,
      "eval_kl_divergence": 0.43913933634757996,
      "eval_loss": 0.46735426783561707,
      "eval_mae": 0.11842861026525497,
      "eval_rmse": 0.15950414538383484,
      "eval_runtime": 64.5571,
      "eval_samples_per_second": 36.464,
      "eval_steps_per_second": 2.293,
      "learning_rate": 1e-05,
      "step": 15768
    },
    {
      "epoch": 36.529680365296805,
      "grad_norm": 0.15660376846790314,
      "learning_rate": 1e-05,
      "loss": 0.4612,
      "step": 16000
    },
    {
      "epoch": 37.0,
      "eval_explained_variance": 0.44943228363990784,
      "eval_kl_divergence": 0.36332887411117554,
      "eval_loss": 0.468018501996994,
      "eval_mae": 0.11912700533866882,
      "eval_rmse": 0.16002707183361053,
      "eval_runtime": 63.2971,
      "eval_samples_per_second": 37.19,
      "eval_steps_per_second": 2.338,
      "learning_rate": 1e-05,
      "step": 16206
    },
    {
      "epoch": 37.67123287671233,
      "grad_norm": 0.15865331888198853,
      "learning_rate": 1e-05,
      "loss": 0.4625,
      "step": 16500
    },
    {
      "epoch": 38.0,
      "eval_explained_variance": 0.45025742053985596,
      "eval_kl_divergence": 0.43029093742370605,
      "eval_loss": 0.46701580286026,
      "eval_mae": 0.1186341941356659,
      "eval_rmse": 0.15923398733139038,
      "eval_runtime": 63.0229,
      "eval_samples_per_second": 37.351,
      "eval_steps_per_second": 2.348,
      "learning_rate": 1e-05,
      "step": 16644
    },
    {
      "epoch": 38.81278538812786,
      "grad_norm": 0.2913804352283478,
      "learning_rate": 1e-05,
      "loss": 0.4608,
      "step": 17000
    },
    {
      "epoch": 39.0,
      "eval_explained_variance": 0.448851078748703,
      "eval_kl_divergence": 0.4562166929244995,
      "eval_loss": 0.4673251509666443,
      "eval_mae": 0.11870113760232925,
      "eval_rmse": 0.1596096307039261,
      "eval_runtime": 63.132,
      "eval_samples_per_second": 37.287,
      "eval_steps_per_second": 2.344,
      "learning_rate": 1e-05,
      "step": 17082
    },
    {
      "epoch": 39.954337899543376,
      "grad_norm": 0.1813182234764099,
      "learning_rate": 1e-05,
      "loss": 0.4614,
      "step": 17500
    },
    {
      "epoch": 40.0,
      "eval_explained_variance": 0.449774831533432,
      "eval_kl_divergence": 0.40653547644615173,
      "eval_loss": 0.4673212468624115,
      "eval_mae": 0.1188703179359436,
      "eval_rmse": 0.15939703583717346,
      "eval_runtime": 65.2215,
      "eval_samples_per_second": 36.092,
      "eval_steps_per_second": 2.269,
      "learning_rate": 1e-05,
      "step": 17520
    },
    {
      "epoch": 41.0,
      "eval_explained_variance": 0.4507579803466797,
      "eval_kl_divergence": 0.3335873782634735,
      "eval_loss": 0.4677547216415405,
      "eval_mae": 0.12059084326028824,
      "eval_rmse": 0.159872904419899,
      "eval_runtime": 65.9882,
      "eval_samples_per_second": 35.673,
      "eval_steps_per_second": 2.243,
      "learning_rate": 1e-05,
      "step": 17958
    },
    {
      "epoch": 41.0958904109589,
      "grad_norm": 0.1584874391555786,
      "learning_rate": 1e-05,
      "loss": 0.4608,
      "step": 18000
    },
    {
      "epoch": 42.0,
      "eval_explained_variance": 0.4486294686794281,
      "eval_kl_divergence": 0.5311685800552368,
      "eval_loss": 0.4671882390975952,
      "eval_mae": 0.1177595853805542,
      "eval_rmse": 0.15967120230197906,
      "eval_runtime": 65.4501,
      "eval_samples_per_second": 35.966,
      "eval_steps_per_second": 2.261,
      "learning_rate": 1e-05,
      "step": 18396
    },
    {
      "epoch": 42.23744292237443,
      "grad_norm": 0.17140232026576996,
      "learning_rate": 1e-05,
      "loss": 0.4615,
      "step": 18500
    },
    {
      "epoch": 43.0,
      "eval_explained_variance": 0.45157137513160706,
      "eval_kl_divergence": 0.3923657536506653,
      "eval_loss": 0.46716412901878357,
      "eval_mae": 0.1185157299041748,
      "eval_rmse": 0.1592295914888382,
      "eval_runtime": 64.3671,
      "eval_samples_per_second": 36.571,
      "eval_steps_per_second": 2.299,
      "learning_rate": 1e-05,
      "step": 18834
    },
    {
      "epoch": 43.37899543378995,
      "grad_norm": 0.12803754210472107,
      "learning_rate": 1e-05,
      "loss": 0.4601,
      "step": 19000
    },
    {
      "epoch": 44.0,
      "eval_explained_variance": 0.44912728667259216,
      "eval_kl_divergence": 0.4258858561515808,
      "eval_loss": 0.4678168296813965,
      "eval_mae": 0.11944716423749924,
      "eval_rmse": 0.16020986437797546,
      "eval_runtime": 65.6519,
      "eval_samples_per_second": 35.856,
      "eval_steps_per_second": 2.254,
      "learning_rate": 1e-05,
      "step": 19272
    },
    {
      "epoch": 44.52054794520548,
      "grad_norm": 0.12536858022212982,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4602,
      "step": 19500
    },
    {
      "epoch": 45.0,
      "eval_explained_variance": 0.4489940404891968,
      "eval_kl_divergence": 0.5214298367500305,
      "eval_loss": 0.46699702739715576,
      "eval_mae": 0.11719372868537903,
      "eval_rmse": 0.15936775505542755,
      "eval_runtime": 64.8181,
      "eval_samples_per_second": 36.317,
      "eval_steps_per_second": 2.283,
      "learning_rate": 1.0000000000000002e-06,
      "step": 19710
    },
    {
      "epoch": 45.662100456621005,
      "grad_norm": 0.12503549456596375,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4616,
      "step": 20000
    },
    {
      "epoch": 46.0,
      "eval_explained_variance": 0.45176592469215393,
      "eval_kl_divergence": 0.4174787700176239,
      "eval_loss": 0.46712958812713623,
      "eval_mae": 0.11880326271057129,
      "eval_rmse": 0.1593877524137497,
      "eval_runtime": 64.134,
      "eval_samples_per_second": 36.704,
      "eval_steps_per_second": 2.308,
      "learning_rate": 1.0000000000000002e-06,
      "step": 20148
    },
    {
      "epoch": 46.80365296803653,
      "grad_norm": 0.1746779829263687,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4602,
      "step": 20500
    },
    {
      "epoch": 47.0,
      "eval_explained_variance": 0.4524901807308197,
      "eval_kl_divergence": 0.4446321427822113,
      "eval_loss": 0.4666382074356079,
      "eval_mae": 0.11884639412164688,
      "eval_rmse": 0.15886224806308746,
      "eval_runtime": 68.911,
      "eval_samples_per_second": 34.16,
      "eval_steps_per_second": 2.148,
      "learning_rate": 1.0000000000000002e-06,
      "step": 20586
    },
    {
      "epoch": 47.945205479452056,
      "grad_norm": 0.18253998458385468,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4604,
      "step": 21000
    },
    {
      "epoch": 48.0,
      "eval_explained_variance": 0.44860827922821045,
      "eval_kl_divergence": 0.5755118727684021,
      "eval_loss": 0.46714723110198975,
      "eval_mae": 0.11802936345338821,
      "eval_rmse": 0.15972274541854858,
      "eval_runtime": 68.5695,
      "eval_samples_per_second": 34.33,
      "eval_steps_per_second": 2.158,
      "learning_rate": 1.0000000000000002e-06,
      "step": 21024
    },
    {
      "epoch": 49.0,
      "eval_explained_variance": 0.4494647979736328,
      "eval_kl_divergence": 0.4303589463233948,
      "eval_loss": 0.46758702397346497,
      "eval_mae": 0.11922705173492432,
      "eval_rmse": 0.15995512902736664,
      "eval_runtime": 68.4997,
      "eval_samples_per_second": 34.365,
      "eval_steps_per_second": 2.161,
      "learning_rate": 1.0000000000000002e-06,
      "step": 21462
    },
    {
      "epoch": 49.08675799086758,
      "grad_norm": 0.1836538463830948,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4606,
      "step": 21500
    },
    {
      "epoch": 50.0,
      "eval_explained_variance": 0.4534037411212921,
      "eval_kl_divergence": 0.33374354243278503,
      "eval_loss": 0.46752068400382996,
      "eval_mae": 0.12040751427412033,
      "eval_rmse": 0.15945331752300262,
      "eval_runtime": 67.7842,
      "eval_samples_per_second": 34.728,
      "eval_steps_per_second": 2.183,
      "learning_rate": 1.0000000000000002e-06,
      "step": 21900
    },
    {
      "epoch": 50.22831050228311,
      "grad_norm": 0.18452928960323334,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4598,
      "step": 22000
    },
    {
      "epoch": 51.0,
      "eval_explained_variance": 0.4523892402648926,
      "eval_kl_divergence": 0.395465224981308,
      "eval_loss": 0.46691644191741943,
      "eval_mae": 0.11809410899877548,
      "eval_rmse": 0.1590944528579712,
      "eval_runtime": 68.2629,
      "eval_samples_per_second": 34.484,
      "eval_steps_per_second": 2.168,
      "learning_rate": 1.0000000000000002e-06,
      "step": 22338
    },
    {
      "epoch": 51.36986301369863,
      "grad_norm": 0.1816985160112381,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4602,
      "step": 22500
    },
    {
      "epoch": 52.0,
      "eval_explained_variance": 0.45300889015197754,
      "eval_kl_divergence": 0.4761090576648712,
      "eval_loss": 0.466439425945282,
      "eval_mae": 0.1174706444144249,
      "eval_rmse": 0.15875311195850372,
      "eval_runtime": 68.2396,
      "eval_samples_per_second": 34.496,
      "eval_steps_per_second": 2.169,
      "learning_rate": 1.0000000000000002e-06,
      "step": 22776
    },
    {
      "epoch": 52.51141552511415,
      "grad_norm": 0.17806819081306458,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.462,
      "step": 23000
    },
    {
      "epoch": 53.0,
      "eval_explained_variance": 0.45259252190589905,
      "eval_kl_divergence": 0.43274176120758057,
      "eval_loss": 0.4667709469795227,
      "eval_mae": 0.11889918893575668,
      "eval_rmse": 0.15901200473308563,
      "eval_runtime": 66.8799,
      "eval_samples_per_second": 35.197,
      "eval_steps_per_second": 2.213,
      "learning_rate": 1.0000000000000002e-06,
      "step": 23214
    },
    {
      "epoch": 53.65296803652968,
      "grad_norm": 0.18054644763469696,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4604,
      "step": 23500
    },
    {
      "epoch": 54.0,
      "eval_explained_variance": 0.4532507658004761,
      "eval_kl_divergence": 0.3724806606769562,
      "eval_loss": 0.46701404452323914,
      "eval_mae": 0.11868719011545181,
      "eval_rmse": 0.15923155844211578,
      "eval_runtime": 73.556,
      "eval_samples_per_second": 32.003,
      "eval_steps_per_second": 2.012,
      "learning_rate": 1.0000000000000002e-06,
      "step": 23652
    },
    {
      "epoch": 54.794520547945204,
      "grad_norm": 0.26471829414367676,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.461,
      "step": 24000
    },
    {
      "epoch": 55.0,
      "eval_explained_variance": 0.45088374614715576,
      "eval_kl_divergence": 0.38409897685050964,
      "eval_loss": 0.467383474111557,
      "eval_mae": 0.11990005522966385,
      "eval_rmse": 0.1595049947500229,
      "eval_runtime": 70.451,
      "eval_samples_per_second": 33.413,
      "eval_steps_per_second": 2.101,
      "learning_rate": 1.0000000000000002e-06,
      "step": 24090
    },
    {
      "epoch": 55.93607305936073,
      "grad_norm": 0.2783886194229126,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4599,
      "step": 24500
    },
    {
      "epoch": 56.0,
      "eval_explained_variance": 0.45115411281585693,
      "eval_kl_divergence": 0.3821828067302704,
      "eval_loss": 0.46739572286605835,
      "eval_mae": 0.11897724121809006,
      "eval_rmse": 0.15964223444461823,
      "eval_runtime": 69.6578,
      "eval_samples_per_second": 33.794,
      "eval_steps_per_second": 2.125,
      "learning_rate": 1.0000000000000002e-06,
      "step": 24528
    },
    {
      "epoch": 57.0,
      "eval_explained_variance": 0.4505263864994049,
      "eval_kl_divergence": 0.4674541652202606,
      "eval_loss": 0.46702033281326294,
      "eval_mae": 0.1185864806175232,
      "eval_rmse": 0.15932416915893555,
      "eval_runtime": 67.4689,
      "eval_samples_per_second": 34.89,
      "eval_steps_per_second": 2.194,
      "learning_rate": 1.0000000000000002e-06,
      "step": 24966
    },
    {
      "epoch": 57.077625570776256,
      "grad_norm": 0.16562320291996002,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.4594,
      "step": 25000
    },
    {
      "epoch": 58.0,
      "eval_explained_variance": 0.4521506726741791,
      "eval_kl_divergence": 0.37376847863197327,
      "eval_loss": 0.46735846996307373,
      "eval_mae": 0.11891353130340576,
      "eval_rmse": 0.15956538915634155,
      "eval_runtime": 68.6492,
      "eval_samples_per_second": 34.29,
      "eval_steps_per_second": 2.156,
      "learning_rate": 1.0000000000000002e-06,
      "step": 25404
    },
    {
      "epoch": 58.21917808219178,
      "grad_norm": 0.21171259880065918,
      "learning_rate": 1.0000000000000002e-07,
      "loss": 0.4613,
      "step": 25500
    },
    {
      "epoch": 59.0,
      "eval_explained_variance": 0.45357391238212585,
      "eval_kl_divergence": 0.4204346239566803,
      "eval_loss": 0.46666717529296875,
      "eval_mae": 0.11845538765192032,
      "eval_rmse": 0.1589372605085373,
      "eval_runtime": 69.2012,
      "eval_samples_per_second": 34.017,
      "eval_steps_per_second": 2.139,
      "learning_rate": 1.0000000000000002e-07,
      "step": 25842
    },
    {
      "epoch": 59.36073059360731,
      "grad_norm": 0.1960112601518631,
      "learning_rate": 1.0000000000000002e-07,
      "loss": 0.4607,
      "step": 26000
    },
    {
      "epoch": 60.0,
      "eval_explained_variance": 0.4513193368911743,
      "eval_kl_divergence": 0.45320600271224976,
      "eval_loss": 0.46685320138931274,
      "eval_mae": 0.11779770255088806,
      "eval_rmse": 0.15917657315731049,
      "eval_runtime": 71.4331,
      "eval_samples_per_second": 32.954,
      "eval_steps_per_second": 2.072,
      "learning_rate": 1.0000000000000002e-07,
      "step": 26280
    },
    {
      "epoch": 60.50228310502283,
      "grad_norm": 0.2178792506456375,
      "learning_rate": 1.0000000000000002e-07,
      "loss": 0.4613,
      "step": 26500
    },
    {
      "epoch": 61.0,
      "eval_explained_variance": 0.45110437273979187,
      "eval_kl_divergence": 0.40322577953338623,
      "eval_loss": 0.46734780073165894,
      "eval_mae": 0.11893540620803833,
      "eval_rmse": 0.1595635712146759,
      "eval_runtime": 69.3534,
      "eval_samples_per_second": 33.942,
      "eval_steps_per_second": 2.134,
      "learning_rate": 1.0000000000000002e-07,
      "step": 26718
    },
    {
      "epoch": 61.64383561643836,
      "grad_norm": 0.16740958392620087,
      "learning_rate": 1.0000000000000002e-07,
      "loss": 0.4598,
      "step": 27000
    },
    {
      "epoch": 62.0,
      "eval_explained_variance": 0.4526772201061249,
      "eval_kl_divergence": 0.3406714200973511,
      "eval_loss": 0.4673011302947998,
      "eval_mae": 0.11888447403907776,
      "eval_rmse": 0.1594574898481369,
      "eval_runtime": 70.4024,
      "eval_samples_per_second": 33.436,
      "eval_steps_per_second": 2.102,
      "learning_rate": 1.0000000000000002e-07,
      "step": 27156
    },
    {
      "epoch": 62.0,
      "learning_rate": 1.0000000000000002e-07,
      "step": 27156,
      "total_flos": 6.42634409963284e+19,
      "train_loss": 0.466335079458891,
      "train_runtime": 17194.6751,
      "train_samples_per_second": 61.092,
      "train_steps_per_second": 3.821
    }
  ],
  "logging_steps": 500,
  "max_steps": 65700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 150,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 10,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.42634409963284e+19,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}