{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25157232704402516, "eval_steps": 120, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020964360587002098, "grad_norm": 0.6940059065818787, "learning_rate": 2e-05, "loss": 1.5374, "step": 1 }, { "epoch": 0.0041928721174004195, "grad_norm": 0.7119749188423157, "learning_rate": 4e-05, "loss": 1.4412, "step": 2 }, { "epoch": 0.006289308176100629, "grad_norm": 0.6409623026847839, "learning_rate": 6e-05, "loss": 1.3945, "step": 3 }, { "epoch": 0.008385744234800839, "grad_norm": 0.7138105630874634, "learning_rate": 8e-05, "loss": 1.4504, "step": 4 }, { "epoch": 0.010482180293501049, "grad_norm": 0.8199965357780457, "learning_rate": 0.0001, "loss": 1.6434, "step": 5 }, { "epoch": 0.012578616352201259, "grad_norm": 0.7341210842132568, "learning_rate": 0.00012, "loss": 1.3584, "step": 6 }, { "epoch": 0.014675052410901468, "grad_norm": 0.8366342782974243, "learning_rate": 0.00014, "loss": 1.6133, "step": 7 }, { "epoch": 0.016771488469601678, "grad_norm": 0.45160987973213196, "learning_rate": 0.00016, "loss": 1.5444, "step": 8 }, { "epoch": 0.018867924528301886, "grad_norm": 0.564163863658905, "learning_rate": 0.00018, "loss": 1.4406, "step": 9 }, { "epoch": 0.020964360587002098, "grad_norm": 1.0345866680145264, "learning_rate": 0.0002, "loss": 1.4192, "step": 10 }, { "epoch": 0.023060796645702306, "grad_norm": 0.7354846596717834, "learning_rate": 0.0001999977372615812, "loss": 1.386, "step": 11 }, { "epoch": 0.025157232704402517, "grad_norm": 0.5225378274917603, "learning_rate": 0.00019999094914872442, "loss": 1.272, "step": 12 }, { "epoch": 0.027253668763102725, "grad_norm": 0.3142692446708679, "learning_rate": 0.0001999796359686242, "loss": 1.3781, "step": 13 }, { "epoch": 0.029350104821802937, "grad_norm": 0.33122241497039795, "learning_rate": 0.00019996379823325583, "loss": 1.5188, "step": 14 }, { "epoch": 0.031446540880503145, "grad_norm": 0.2958654761314392, "learning_rate": 0.0001999434366593524, "loss": 0.999, "step": 15 }, { "epoch": 0.033542976939203356, "grad_norm": 0.4279610812664032, "learning_rate": 0.00019991855216837224, "loss": 1.5178, "step": 16 }, { "epoch": 0.03563941299790356, "grad_norm": 0.36770907044410706, "learning_rate": 0.00019988914588645715, "loss": 1.2745, "step": 17 }, { "epoch": 0.03773584905660377, "grad_norm": 0.3166482150554657, "learning_rate": 0.00019985521914438165, "loss": 1.5023, "step": 18 }, { "epoch": 0.039832285115303984, "grad_norm": 0.42765095829963684, "learning_rate": 0.0001998167734774926, "loss": 1.2504, "step": 19 }, { "epoch": 0.041928721174004195, "grad_norm": 0.392689973115921, "learning_rate": 0.00019977381062563976, "loss": 1.2228, "step": 20 }, { "epoch": 0.0440251572327044, "grad_norm": 0.358163982629776, "learning_rate": 0.000199726332533097, "loss": 1.2634, "step": 21 }, { "epoch": 0.04612159329140461, "grad_norm": 0.3274112939834595, "learning_rate": 0.00019967434134847442, "loss": 1.4746, "step": 22 }, { "epoch": 0.04821802935010482, "grad_norm": 0.3587968945503235, "learning_rate": 0.00019961783942462104, "loss": 1.3947, "step": 23 }, { "epoch": 0.050314465408805034, "grad_norm": 0.30727654695510864, "learning_rate": 0.00019955682931851833, "loss": 1.4815, "step": 24 }, { "epoch": 0.05241090146750524, "grad_norm": 0.4096279442310333, "learning_rate": 0.00019949131379116454, "loss": 1.3225, "step": 25 }, { "epoch": 0.05450733752620545, "grad_norm": 0.36623865365982056, "learning_rate": 0.00019942129580744966, "loss": 1.3904, "step": 26 }, { "epoch": 0.05660377358490566, "grad_norm": 0.3568407893180847, "learning_rate": 0.00019934677853602133, "loss": 1.463, "step": 27 }, { "epoch": 0.05870020964360587, "grad_norm": 0.4338196814060211, "learning_rate": 0.0001992677653491414, "loss": 1.4359, "step": 28 }, { "epoch": 0.06079664570230608, "grad_norm": 0.4408683180809021, "learning_rate": 0.00019918425982253334, "loss": 1.8015, "step": 29 }, { "epoch": 0.06289308176100629, "grad_norm": 0.3609876036643982, "learning_rate": 0.00019909626573522043, "loss": 1.3589, "step": 30 }, { "epoch": 0.0649895178197065, "grad_norm": 0.43560177087783813, "learning_rate": 0.0001990037870693547, "loss": 1.734, "step": 31 }, { "epoch": 0.06708595387840671, "grad_norm": 0.37430861592292786, "learning_rate": 0.00019890682801003675, "loss": 1.3517, "step": 32 }, { "epoch": 0.06918238993710692, "grad_norm": 0.4608246386051178, "learning_rate": 0.00019880539294512637, "loss": 1.4881, "step": 33 }, { "epoch": 0.07127882599580712, "grad_norm": 0.41597816348075867, "learning_rate": 0.0001986994864650439, "loss": 1.2676, "step": 34 }, { "epoch": 0.07337526205450734, "grad_norm": 0.561418354511261, "learning_rate": 0.00019858911336256257, "loss": 1.4233, "step": 35 }, { "epoch": 0.07547169811320754, "grad_norm": 0.9351180195808411, "learning_rate": 0.00019847427863259163, "loss": 1.2086, "step": 36 }, { "epoch": 0.07756813417190776, "grad_norm": 0.6147457957267761, "learning_rate": 0.00019835498747195008, "loss": 1.4909, "step": 37 }, { "epoch": 0.07966457023060797, "grad_norm": 0.4514181315898895, "learning_rate": 0.00019823124527913185, "loss": 1.2649, "step": 38 }, { "epoch": 0.08176100628930817, "grad_norm": 0.49401888251304626, "learning_rate": 0.0001981030576540612, "loss": 1.5149, "step": 39 }, { "epoch": 0.08385744234800839, "grad_norm": 0.6095734238624573, "learning_rate": 0.00019797043039783936, "loss": 1.4917, "step": 40 }, { "epoch": 0.0859538784067086, "grad_norm": 0.42444926500320435, "learning_rate": 0.0001978333695124821, "loss": 1.3691, "step": 41 }, { "epoch": 0.0880503144654088, "grad_norm": 0.47243213653564453, "learning_rate": 0.00019769188120064812, "loss": 1.7828, "step": 42 }, { "epoch": 0.09014675052410902, "grad_norm": 0.4187338650226593, "learning_rate": 0.00019754597186535814, "loss": 1.2147, "step": 43 }, { "epoch": 0.09224318658280922, "grad_norm": 0.4433446228504181, "learning_rate": 0.0001973956481097053, "loss": 1.1449, "step": 44 }, { "epoch": 0.09433962264150944, "grad_norm": 0.5269142389297485, "learning_rate": 0.0001972409167365564, "loss": 1.4682, "step": 45 }, { "epoch": 0.09643605870020965, "grad_norm": 0.4906723201274872, "learning_rate": 0.0001970817847482439, "loss": 1.3701, "step": 46 }, { "epoch": 0.09853249475890985, "grad_norm": 0.5275290608406067, "learning_rate": 0.000196918259346249, "loss": 1.3704, "step": 47 }, { "epoch": 0.10062893081761007, "grad_norm": 0.5568628907203674, "learning_rate": 0.00019675034793087596, "loss": 1.068, "step": 48 }, { "epoch": 0.10272536687631027, "grad_norm": 0.6039868593215942, "learning_rate": 0.000196578058100917, "loss": 1.2204, "step": 49 }, { "epoch": 0.10482180293501048, "grad_norm": 0.9857679605484009, "learning_rate": 0.0001964013976533084, "loss": 1.0091, "step": 50 }, { "epoch": 0.1069182389937107, "grad_norm": 0.3437671959400177, "learning_rate": 0.00019622037458277784, "loss": 1.2225, "step": 51 }, { "epoch": 0.1090146750524109, "grad_norm": 0.3308734893798828, "learning_rate": 0.00019603499708148244, "loss": 1.2099, "step": 52 }, { "epoch": 0.1111111111111111, "grad_norm": 0.353939026594162, "learning_rate": 0.0001958452735386381, "loss": 1.2554, "step": 53 }, { "epoch": 0.11320754716981132, "grad_norm": 0.3151988089084625, "learning_rate": 0.00019565121254013979, "loss": 1.252, "step": 54 }, { "epoch": 0.11530398322851153, "grad_norm": 0.32421159744262695, "learning_rate": 0.00019545282286817303, "loss": 0.9776, "step": 55 }, { "epoch": 0.11740041928721175, "grad_norm": 0.3662404417991638, "learning_rate": 0.0001952501135008165, "loss": 1.3977, "step": 56 }, { "epoch": 0.11949685534591195, "grad_norm": 0.3840480148792267, "learning_rate": 0.00019504309361163566, "loss": 1.2663, "step": 57 }, { "epoch": 0.12159329140461216, "grad_norm": 0.37903356552124023, "learning_rate": 0.00019483177256926767, "loss": 1.5308, "step": 58 }, { "epoch": 0.12368972746331237, "grad_norm": 0.346229612827301, "learning_rate": 0.0001946161599369973, "loss": 1.4319, "step": 59 }, { "epoch": 0.12578616352201258, "grad_norm": 0.34781116247177124, "learning_rate": 0.00019439626547232433, "loss": 1.1933, "step": 60 }, { "epoch": 0.1278825995807128, "grad_norm": 0.3286825716495514, "learning_rate": 0.0001941720991265218, "loss": 1.1038, "step": 61 }, { "epoch": 0.129979035639413, "grad_norm": 0.39212745428085327, "learning_rate": 0.00019394367104418576, "loss": 1.2789, "step": 62 }, { "epoch": 0.1320754716981132, "grad_norm": 0.3172178566455841, "learning_rate": 0.0001937109915627762, "loss": 1.1614, "step": 63 }, { "epoch": 0.13417190775681342, "grad_norm": 0.371159166097641, "learning_rate": 0.00019347407121214914, "loss": 1.3819, "step": 64 }, { "epoch": 0.13626834381551362, "grad_norm": 0.36089271306991577, "learning_rate": 0.00019323292071408017, "loss": 1.4392, "step": 65 }, { "epoch": 0.13836477987421383, "grad_norm": 0.42245927453041077, "learning_rate": 0.00019298755098177926, "loss": 1.2518, "step": 66 }, { "epoch": 0.14046121593291405, "grad_norm": 0.3602246642112732, "learning_rate": 0.00019273797311939673, "loss": 1.3146, "step": 67 }, { "epoch": 0.14255765199161424, "grad_norm": 0.3581138253211975, "learning_rate": 0.00019248419842152098, "loss": 1.2622, "step": 68 }, { "epoch": 0.14465408805031446, "grad_norm": 0.391454815864563, "learning_rate": 0.0001922262383726672, "loss": 1.4421, "step": 69 }, { "epoch": 0.14675052410901468, "grad_norm": 0.4634746313095093, "learning_rate": 0.00019196410464675766, "loss": 1.3862, "step": 70 }, { "epoch": 0.1488469601677149, "grad_norm": 0.35802096128463745, "learning_rate": 0.00019169780910659333, "loss": 1.4004, "step": 71 }, { "epoch": 0.1509433962264151, "grad_norm": 0.34099411964416504, "learning_rate": 0.00019142736380331726, "loss": 1.2887, "step": 72 }, { "epoch": 0.1530398322851153, "grad_norm": 0.37205106019973755, "learning_rate": 0.00019115278097586903, "loss": 1.518, "step": 73 }, { "epoch": 0.15513626834381553, "grad_norm": 0.3985058665275574, "learning_rate": 0.00019087407305043086, "loss": 1.3483, "step": 74 }, { "epoch": 0.15723270440251572, "grad_norm": 0.3541426956653595, "learning_rate": 0.0001905912526398654, "loss": 1.3036, "step": 75 }, { "epoch": 0.15932914046121593, "grad_norm": 0.44033437967300415, "learning_rate": 0.00019030433254314474, "loss": 1.3732, "step": 76 }, { "epoch": 0.16142557651991615, "grad_norm": 0.40152212977409363, "learning_rate": 0.00019001332574477146, "loss": 1.479, "step": 77 }, { "epoch": 0.16352201257861634, "grad_norm": 0.46172958612442017, "learning_rate": 0.00018971824541419083, "loss": 1.381, "step": 78 }, { "epoch": 0.16561844863731656, "grad_norm": 0.40097662806510925, "learning_rate": 0.0001894191049051948, "loss": 1.1499, "step": 79 }, { "epoch": 0.16771488469601678, "grad_norm": 0.49080637097358704, "learning_rate": 0.0001891159177553179, "loss": 1.664, "step": 80 }, { "epoch": 0.16981132075471697, "grad_norm": 0.45318862795829773, "learning_rate": 0.00018880869768522432, "loss": 1.3287, "step": 81 }, { "epoch": 0.1719077568134172, "grad_norm": 0.4062664210796356, "learning_rate": 0.00018849745859808717, "loss": 1.2012, "step": 82 }, { "epoch": 0.1740041928721174, "grad_norm": 0.4371073246002197, "learning_rate": 0.00018818221457895926, "loss": 1.4706, "step": 83 }, { "epoch": 0.1761006289308176, "grad_norm": 0.41299256682395935, "learning_rate": 0.00018786297989413568, "loss": 1.2486, "step": 84 }, { "epoch": 0.17819706498951782, "grad_norm": 0.44734108448028564, "learning_rate": 0.00018753976899050812, "loss": 1.1505, "step": 85 }, { "epoch": 0.18029350104821804, "grad_norm": 0.552854597568512, "learning_rate": 0.00018721259649491113, "loss": 1.5622, "step": 86 }, { "epoch": 0.18238993710691823, "grad_norm": 0.541213870048523, "learning_rate": 0.0001868814772134603, "loss": 1.5055, "step": 87 }, { "epoch": 0.18448637316561844, "grad_norm": 0.48175540566444397, "learning_rate": 0.00018654642613088194, "loss": 1.2456, "step": 88 }, { "epoch": 0.18658280922431866, "grad_norm": 0.5197116732597351, "learning_rate": 0.0001862074584098352, "loss": 1.4801, "step": 89 }, { "epoch": 0.18867924528301888, "grad_norm": 0.46993735432624817, "learning_rate": 0.00018586458939022586, "loss": 1.5128, "step": 90 }, { "epoch": 0.19077568134171907, "grad_norm": 0.5093168616294861, "learning_rate": 0.00018551783458851189, "loss": 1.521, "step": 91 }, { "epoch": 0.1928721174004193, "grad_norm": 0.4279519021511078, "learning_rate": 0.0001851672096970016, "loss": 1.0692, "step": 92 }, { "epoch": 0.1949685534591195, "grad_norm": 0.48902031779289246, "learning_rate": 0.00018481273058314316, "loss": 1.3202, "step": 93 }, { "epoch": 0.1970649895178197, "grad_norm": 0.5409737229347229, "learning_rate": 0.00018445441328880682, "loss": 1.6125, "step": 94 }, { "epoch": 0.19916142557651992, "grad_norm": 0.5205714702606201, "learning_rate": 0.00018409227402955871, "loss": 1.1616, "step": 95 }, { "epoch": 0.20125786163522014, "grad_norm": 0.5157482624053955, "learning_rate": 0.00018372632919392716, "loss": 1.3375, "step": 96 }, { "epoch": 0.20335429769392033, "grad_norm": 0.5590908527374268, "learning_rate": 0.00018335659534266094, "loss": 1.6429, "step": 97 }, { "epoch": 0.20545073375262055, "grad_norm": 0.5677520036697388, "learning_rate": 0.00018298308920797985, "loss": 1.1629, "step": 98 }, { "epoch": 0.20754716981132076, "grad_norm": 0.6165626645088196, "learning_rate": 0.00018260582769281743, "loss": 1.0469, "step": 99 }, { "epoch": 0.20964360587002095, "grad_norm": 0.7722473740577698, "learning_rate": 0.0001822248278700563, "loss": 1.7717, "step": 100 }, { "epoch": 0.21174004192872117, "grad_norm": 0.34500235319137573, "learning_rate": 0.00018184010698175506, "loss": 1.0338, "step": 101 }, { "epoch": 0.2138364779874214, "grad_norm": 0.4223347008228302, "learning_rate": 0.0001814516824383685, "loss": 1.384, "step": 102 }, { "epoch": 0.21593291404612158, "grad_norm": 0.3532989025115967, "learning_rate": 0.0001810595718179593, "loss": 1.1763, "step": 103 }, { "epoch": 0.2180293501048218, "grad_norm": 0.31655967235565186, "learning_rate": 0.00018066379286540277, "loss": 1.4366, "step": 104 }, { "epoch": 0.22012578616352202, "grad_norm": 0.580037534236908, "learning_rate": 0.00018026436349158378, "loss": 1.4038, "step": 105 }, { "epoch": 0.2222222222222222, "grad_norm": 0.3317371606826782, "learning_rate": 0.00017986130177258608, "loss": 1.2701, "step": 106 }, { "epoch": 0.22431865828092243, "grad_norm": 0.34435999393463135, "learning_rate": 0.00017945462594887445, "loss": 1.2306, "step": 107 }, { "epoch": 0.22641509433962265, "grad_norm": 0.30907875299453735, "learning_rate": 0.000179044354424469, "loss": 1.0864, "step": 108 }, { "epoch": 0.22851153039832284, "grad_norm": 0.3259734511375427, "learning_rate": 0.00017863050576611265, "loss": 1.1871, "step": 109 }, { "epoch": 0.23060796645702306, "grad_norm": 0.3698357939720154, "learning_rate": 0.00017821309870243054, "loss": 1.2336, "step": 110 }, { "epoch": 0.23270440251572327, "grad_norm": 0.3339691162109375, "learning_rate": 0.00017779215212308265, "loss": 1.1696, "step": 111 }, { "epoch": 0.2348008385744235, "grad_norm": 0.333344429731369, "learning_rate": 0.0001773676850779089, "loss": 1.3809, "step": 112 }, { "epoch": 0.23689727463312368, "grad_norm": 0.35278016328811646, "learning_rate": 0.00017693971677606714, "loss": 1.3156, "step": 113 }, { "epoch": 0.2389937106918239, "grad_norm": 0.3800717890262604, "learning_rate": 0.00017650826658516375, "loss": 1.1809, "step": 114 }, { "epoch": 0.24109014675052412, "grad_norm": 0.353089302778244, "learning_rate": 0.00017607335403037712, "loss": 1.5121, "step": 115 }, { "epoch": 0.2431865828092243, "grad_norm": 0.3874945044517517, "learning_rate": 0.00017563499879357425, "loss": 1.5124, "step": 116 }, { "epoch": 0.24528301886792453, "grad_norm": 0.3635624945163727, "learning_rate": 0.00017519322071241983, "loss": 1.1454, "step": 117 }, { "epoch": 0.24737945492662475, "grad_norm": 0.39976125955581665, "learning_rate": 0.0001747480397794786, "loss": 1.4797, "step": 118 }, { "epoch": 0.24947589098532494, "grad_norm": 0.3654632866382599, "learning_rate": 0.0001742994761413105, "loss": 1.2913, "step": 119 }, { "epoch": 0.25157232704402516, "grad_norm": 0.4034808874130249, "learning_rate": 0.0001738475500975592, "loss": 1.4904, "step": 120 }, { "epoch": 0.25157232704402516, "eval_loss": 1.3252530097961426, "eval_runtime": 13.8389, "eval_samples_per_second": 14.524, "eval_steps_per_second": 7.298, "step": 120 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 120, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.141788609845658e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }