|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.750816104461371, |
|
"eval_steps": 230, |
|
"global_step": 690, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001088139281828074, |
|
"grad_norm": 0.04828796908259392, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0762, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002176278563656148, |
|
"grad_norm": 0.06207922473549843, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0973, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.003264417845484222, |
|
"grad_norm": 0.06030188128352165, |
|
"learning_rate": 6e-05, |
|
"loss": 0.1003, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.004352557127312296, |
|
"grad_norm": 0.05758730694651604, |
|
"learning_rate": 8e-05, |
|
"loss": 0.085, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00544069640914037, |
|
"grad_norm": 0.07111983001232147, |
|
"learning_rate": 0.0001, |
|
"loss": 0.1174, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006528835690968444, |
|
"grad_norm": 0.09522929042577744, |
|
"learning_rate": 0.00012, |
|
"loss": 0.1404, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.007616974972796518, |
|
"grad_norm": 0.08600069582462311, |
|
"learning_rate": 0.00014, |
|
"loss": 0.1187, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.008705114254624592, |
|
"grad_norm": 0.0808354914188385, |
|
"learning_rate": 0.00016, |
|
"loss": 0.1235, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.009793253536452665, |
|
"grad_norm": 0.10051260888576508, |
|
"learning_rate": 0.00018, |
|
"loss": 0.135, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01088139281828074, |
|
"grad_norm": 0.08877796679735184, |
|
"learning_rate": 0.0002, |
|
"loss": 0.0803, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.011969532100108813, |
|
"grad_norm": 0.10384293645620346, |
|
"learning_rate": 0.00019999940277008808, |
|
"loss": 0.1206, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.013057671381936888, |
|
"grad_norm": 0.1314290463924408, |
|
"learning_rate": 0.00019999761108748597, |
|
"loss": 0.1249, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.014145810663764961, |
|
"grad_norm": 0.12873488664627075, |
|
"learning_rate": 0.00019999462497359466, |
|
"loss": 0.0847, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.015233949945593036, |
|
"grad_norm": 0.12493155896663666, |
|
"learning_rate": 0.000199990444464082, |
|
"loss": 0.0858, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.01632208922742111, |
|
"grad_norm": 0.1131022647023201, |
|
"learning_rate": 0.00019998506960888256, |
|
"loss": 0.0737, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.017410228509249184, |
|
"grad_norm": 0.1214890405535698, |
|
"learning_rate": 0.0001999785004721968, |
|
"loss": 0.1151, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.018498367791077257, |
|
"grad_norm": 0.12649253010749817, |
|
"learning_rate": 0.0001999707371324904, |
|
"loss": 0.0913, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.01958650707290533, |
|
"grad_norm": 0.12470237910747528, |
|
"learning_rate": 0.00019996177968249334, |
|
"loss": 0.1043, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.020674646354733407, |
|
"grad_norm": 0.15574277937412262, |
|
"learning_rate": 0.00019995162822919883, |
|
"loss": 0.1092, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02176278563656148, |
|
"grad_norm": 0.1281992644071579, |
|
"learning_rate": 0.0001999402828938618, |
|
"loss": 0.0892, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.022850924918389554, |
|
"grad_norm": 0.16906976699829102, |
|
"learning_rate": 0.00019992774381199778, |
|
"loss": 0.1633, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.023939064200217627, |
|
"grad_norm": 0.12387573719024658, |
|
"learning_rate": 0.00019991401113338104, |
|
"loss": 0.0872, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.025027203482045703, |
|
"grad_norm": 0.12343913316726685, |
|
"learning_rate": 0.00019989908502204292, |
|
"loss": 0.1084, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.026115342763873776, |
|
"grad_norm": 0.11399204283952713, |
|
"learning_rate": 0.00019988296565626987, |
|
"loss": 0.0824, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.02720348204570185, |
|
"grad_norm": 0.1402149796485901, |
|
"learning_rate": 0.00019986565322860115, |
|
"loss": 0.1289, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.028291621327529923, |
|
"grad_norm": 0.13711000978946686, |
|
"learning_rate": 0.00019984714794582683, |
|
"loss": 0.097, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.029379760609358, |
|
"grad_norm": 0.18544617295265198, |
|
"learning_rate": 0.000199827450028985, |
|
"loss": 0.1281, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.030467899891186073, |
|
"grad_norm": 0.15856046974658966, |
|
"learning_rate": 0.00019980655971335945, |
|
"loss": 0.1027, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.031556039173014146, |
|
"grad_norm": 0.12590578198432922, |
|
"learning_rate": 0.00019978447724847652, |
|
"loss": 0.0863, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.03264417845484222, |
|
"grad_norm": 0.14784540235996246, |
|
"learning_rate": 0.00019976120289810247, |
|
"loss": 0.1125, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03373231773667029, |
|
"grad_norm": 0.19753308594226837, |
|
"learning_rate": 0.00019973673694024, |
|
"loss": 0.1389, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.03482045701849837, |
|
"grad_norm": 0.17247086763381958, |
|
"learning_rate": 0.00019971107966712518, |
|
"loss": 0.0901, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.035908596300326445, |
|
"grad_norm": 0.23573236167430878, |
|
"learning_rate": 0.0001996842313852238, |
|
"loss": 0.1141, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.036996735582154515, |
|
"grad_norm": 0.1349520981311798, |
|
"learning_rate": 0.0001996561924152278, |
|
"loss": 0.077, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.03808487486398259, |
|
"grad_norm": 0.2573637068271637, |
|
"learning_rate": 0.00019962696309205148, |
|
"loss": 0.0965, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03917301414581066, |
|
"grad_norm": 0.2688570022583008, |
|
"learning_rate": 0.0001995965437648273, |
|
"loss": 0.1754, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04026115342763874, |
|
"grad_norm": 0.2946501076221466, |
|
"learning_rate": 0.0001995649347969019, |
|
"loss": 0.1893, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.041349292709466814, |
|
"grad_norm": 0.2942318320274353, |
|
"learning_rate": 0.00019953213656583168, |
|
"loss": 0.1461, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.042437431991294884, |
|
"grad_norm": 0.3316934406757355, |
|
"learning_rate": 0.00019949814946337838, |
|
"loss": 0.1072, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.04352557127312296, |
|
"grad_norm": 0.3863315284252167, |
|
"learning_rate": 0.00019946297389550433, |
|
"loss": 0.1517, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04461371055495103, |
|
"grad_norm": 0.3102675974369049, |
|
"learning_rate": 0.00019942661028236745, |
|
"loss": 0.1421, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.04570184983677911, |
|
"grad_norm": 0.3270488679409027, |
|
"learning_rate": 0.00019938905905831654, |
|
"loss": 0.085, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.046789989118607184, |
|
"grad_norm": 0.3799861669540405, |
|
"learning_rate": 0.0001993503206718859, |
|
"loss": 0.1333, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.04787812840043525, |
|
"grad_norm": 0.45475858449935913, |
|
"learning_rate": 0.00019931039558578997, |
|
"loss": 0.1431, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.04896626768226333, |
|
"grad_norm": 0.3995339870452881, |
|
"learning_rate": 0.00019926928427691786, |
|
"loss": 0.18, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05005440696409141, |
|
"grad_norm": 0.4547290503978729, |
|
"learning_rate": 0.00019922698723632767, |
|
"loss": 0.1578, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.051142546245919476, |
|
"grad_norm": 0.44180116057395935, |
|
"learning_rate": 0.0001991835049692405, |
|
"loss": 0.2327, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.05223068552774755, |
|
"grad_norm": 0.8059853911399841, |
|
"learning_rate": 0.0001991388379950346, |
|
"loss": 0.3089, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.05331882480957562, |
|
"grad_norm": 0.6336686015129089, |
|
"learning_rate": 0.00019909298684723904, |
|
"loss": 0.2055, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0544069640914037, |
|
"grad_norm": 0.8279074430465698, |
|
"learning_rate": 0.00019904595207352737, |
|
"loss": 0.2626, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.055495103373231776, |
|
"grad_norm": 0.07426032423973083, |
|
"learning_rate": 0.000198997734235711, |
|
"loss": 0.0632, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.056583242655059846, |
|
"grad_norm": 0.09094005823135376, |
|
"learning_rate": 0.00019894833390973266, |
|
"loss": 0.0734, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.05767138193688792, |
|
"grad_norm": 0.09561450034379959, |
|
"learning_rate": 0.00019889775168565943, |
|
"loss": 0.0972, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.058759521218716, |
|
"grad_norm": 0.09174304455518723, |
|
"learning_rate": 0.00019884598816767563, |
|
"loss": 0.082, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.05984766050054407, |
|
"grad_norm": 0.0911480113863945, |
|
"learning_rate": 0.0001987930439740757, |
|
"loss": 0.0712, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.060935799782372145, |
|
"grad_norm": 0.1090071052312851, |
|
"learning_rate": 0.0001987389197372567, |
|
"loss": 0.09, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.062023939064200215, |
|
"grad_norm": 0.09118974953889847, |
|
"learning_rate": 0.00019868361610371097, |
|
"loss": 0.0946, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.06311207834602829, |
|
"grad_norm": 0.09903446584939957, |
|
"learning_rate": 0.0001986271337340182, |
|
"loss": 0.1074, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.06420021762785637, |
|
"grad_norm": 0.08208192884922028, |
|
"learning_rate": 0.00019856947330283752, |
|
"loss": 0.0847, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.06528835690968444, |
|
"grad_norm": 0.08504832535982132, |
|
"learning_rate": 0.0001985106354988997, |
|
"loss": 0.0852, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06637649619151251, |
|
"grad_norm": 0.07276565581560135, |
|
"learning_rate": 0.0001984506210249986, |
|
"loss": 0.061, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.06746463547334058, |
|
"grad_norm": 0.08346979320049286, |
|
"learning_rate": 0.00019838943059798304, |
|
"loss": 0.0717, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.06855277475516866, |
|
"grad_norm": 0.09144837409257889, |
|
"learning_rate": 0.0001983270649487481, |
|
"loss": 0.0817, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.06964091403699674, |
|
"grad_norm": 0.09562050551176071, |
|
"learning_rate": 0.00019826352482222638, |
|
"loss": 0.0809, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.07072905331882481, |
|
"grad_norm": 0.10410594195127487, |
|
"learning_rate": 0.00019819881097737915, |
|
"loss": 0.0801, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07181719260065289, |
|
"grad_norm": 0.0836932361125946, |
|
"learning_rate": 0.00019813292418718732, |
|
"loss": 0.0775, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.07290533188248095, |
|
"grad_norm": 0.09397463500499725, |
|
"learning_rate": 0.0001980658652386421, |
|
"loss": 0.065, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.07399347116430903, |
|
"grad_norm": 0.108340784907341, |
|
"learning_rate": 0.0001979976349327357, |
|
"loss": 0.0981, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.0750816104461371, |
|
"grad_norm": 0.08717814087867737, |
|
"learning_rate": 0.00019792823408445174, |
|
"loss": 0.0798, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.07616974972796518, |
|
"grad_norm": 0.1279960721731186, |
|
"learning_rate": 0.00019785766352275542, |
|
"loss": 0.0993, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07725788900979326, |
|
"grad_norm": 0.13422514498233795, |
|
"learning_rate": 0.00019778592409058378, |
|
"loss": 0.1023, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.07834602829162132, |
|
"grad_norm": 0.10113417357206345, |
|
"learning_rate": 0.0001977130166448355, |
|
"loss": 0.0742, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.0794341675734494, |
|
"grad_norm": 0.1026310920715332, |
|
"learning_rate": 0.00019763894205636072, |
|
"loss": 0.0988, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.08052230685527748, |
|
"grad_norm": 0.09779265522956848, |
|
"learning_rate": 0.00019756370120995066, |
|
"loss": 0.0738, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.08161044613710555, |
|
"grad_norm": 0.14643464982509613, |
|
"learning_rate": 0.000197487295004327, |
|
"loss": 0.09, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08269858541893363, |
|
"grad_norm": 0.11976644396781921, |
|
"learning_rate": 0.00019740972435213115, |
|
"loss": 0.0904, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.08378672470076169, |
|
"grad_norm": 0.10904321819543839, |
|
"learning_rate": 0.00019733099017991341, |
|
"loss": 0.0766, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.08487486398258977, |
|
"grad_norm": 0.1651339828968048, |
|
"learning_rate": 0.0001972510934281218, |
|
"loss": 0.1186, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.08596300326441784, |
|
"grad_norm": 0.11781762540340424, |
|
"learning_rate": 0.00019717003505109095, |
|
"loss": 0.0833, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.08705114254624592, |
|
"grad_norm": 0.15122370421886444, |
|
"learning_rate": 0.00019708781601703065, |
|
"loss": 0.1166, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.088139281828074, |
|
"grad_norm": 0.1798838973045349, |
|
"learning_rate": 0.00019700443730801413, |
|
"loss": 0.1109, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.08922742110990206, |
|
"grad_norm": 0.18025629222393036, |
|
"learning_rate": 0.00019691989991996663, |
|
"loss": 0.1243, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.09031556039173014, |
|
"grad_norm": 0.1731874644756317, |
|
"learning_rate": 0.00019683420486265327, |
|
"loss": 0.1189, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.09140369967355821, |
|
"grad_norm": 0.2220824509859085, |
|
"learning_rate": 0.0001967473531596671, |
|
"loss": 0.1451, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.09249183895538629, |
|
"grad_norm": 0.1664338856935501, |
|
"learning_rate": 0.00019665934584841682, |
|
"loss": 0.0897, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09357997823721437, |
|
"grad_norm": 0.17619486153125763, |
|
"learning_rate": 0.00019657018398011434, |
|
"loss": 0.0935, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.09466811751904244, |
|
"grad_norm": 0.24987219274044037, |
|
"learning_rate": 0.00019647986861976246, |
|
"loss": 0.1955, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.0957562568008705, |
|
"grad_norm": 0.21318784356117249, |
|
"learning_rate": 0.00019638840084614182, |
|
"loss": 0.1131, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.09684439608269858, |
|
"grad_norm": 0.3128167390823364, |
|
"learning_rate": 0.0001962957817517982, |
|
"loss": 0.2011, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.09793253536452666, |
|
"grad_norm": 0.2833835184574127, |
|
"learning_rate": 0.00019620201244302952, |
|
"loss": 0.1592, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09902067464635474, |
|
"grad_norm": 0.3286789357662201, |
|
"learning_rate": 0.00019610709403987246, |
|
"loss": 0.1602, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.10010881392818281, |
|
"grad_norm": 0.44117358326911926, |
|
"learning_rate": 0.00019601102767608923, |
|
"loss": 0.2323, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.10119695321001088, |
|
"grad_norm": 0.3795579671859741, |
|
"learning_rate": 0.00019591381449915397, |
|
"loss": 0.1867, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.10228509249183895, |
|
"grad_norm": 0.5780506730079651, |
|
"learning_rate": 0.000195815455670239, |
|
"loss": 0.1884, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.10337323177366703, |
|
"grad_norm": 0.5124024748802185, |
|
"learning_rate": 0.00019571595236420102, |
|
"loss": 0.221, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.1044613710554951, |
|
"grad_norm": 0.4628782272338867, |
|
"learning_rate": 0.00019561530576956703, |
|
"loss": 0.208, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.10554951033732318, |
|
"grad_norm": 0.3904087543487549, |
|
"learning_rate": 0.0001955135170885202, |
|
"loss": 0.2029, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.10663764961915125, |
|
"grad_norm": 0.513387143611908, |
|
"learning_rate": 0.00019541058753688538, |
|
"loss": 0.2248, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.10772578890097932, |
|
"grad_norm": 0.6133597493171692, |
|
"learning_rate": 0.00019530651834411474, |
|
"loss": 0.3751, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1088139281828074, |
|
"grad_norm": 0.6515563726425171, |
|
"learning_rate": 0.00019520131075327298, |
|
"loss": 0.2096, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10990206746463548, |
|
"grad_norm": 0.07718291878700256, |
|
"learning_rate": 0.00019509496602102252, |
|
"loss": 0.0631, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.11099020674646355, |
|
"grad_norm": 0.07896394282579422, |
|
"learning_rate": 0.00019498748541760846, |
|
"loss": 0.095, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.11207834602829161, |
|
"grad_norm": 0.07955438643693924, |
|
"learning_rate": 0.00019487887022684336, |
|
"loss": 0.067, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.11316648531011969, |
|
"grad_norm": 0.08391376584768295, |
|
"learning_rate": 0.0001947691217460921, |
|
"loss": 0.0637, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.11425462459194777, |
|
"grad_norm": 0.07990088313817978, |
|
"learning_rate": 0.00019465824128625617, |
|
"loss": 0.0569, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.11534276387377584, |
|
"grad_norm": 0.09000790864229202, |
|
"learning_rate": 0.00019454623017175812, |
|
"loss": 0.0639, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.11643090315560392, |
|
"grad_norm": 0.0831453874707222, |
|
"learning_rate": 0.0001944330897405257, |
|
"loss": 0.0766, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.117519042437432, |
|
"grad_norm": 0.08180610835552216, |
|
"learning_rate": 0.00019431882134397598, |
|
"loss": 0.0806, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.11860718171926006, |
|
"grad_norm": 0.07601706683635712, |
|
"learning_rate": 0.0001942034263469989, |
|
"loss": 0.0727, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.11969532100108814, |
|
"grad_norm": 0.08873546868562698, |
|
"learning_rate": 0.00019408690612794148, |
|
"loss": 0.0878, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12078346028291621, |
|
"grad_norm": 0.10206414759159088, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.1171, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.12187159956474429, |
|
"grad_norm": 0.07465587556362152, |
|
"learning_rate": 0.00019385049560415794, |
|
"loss": 0.0603, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.12295973884657237, |
|
"grad_norm": 0.09952360391616821, |
|
"learning_rate": 0.00019373060812326052, |
|
"loss": 0.0923, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.12404787812840043, |
|
"grad_norm": 0.09894778579473495, |
|
"learning_rate": 0.00019360960106790643, |
|
"loss": 0.0795, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.1251360174102285, |
|
"grad_norm": 0.09721358120441437, |
|
"learning_rate": 0.00019348747588347637, |
|
"loss": 0.1103, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.12622415669205658, |
|
"grad_norm": 0.10310002416372299, |
|
"learning_rate": 0.00019336423402870653, |
|
"loss": 0.1037, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.12731229597388466, |
|
"grad_norm": 0.10904382914304733, |
|
"learning_rate": 0.0001932398769756714, |
|
"loss": 0.1063, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.12840043525571274, |
|
"grad_norm": 0.11545544862747192, |
|
"learning_rate": 0.00019311440620976597, |
|
"loss": 0.096, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.1294885745375408, |
|
"grad_norm": 0.08674886077642441, |
|
"learning_rate": 0.00019298782322968815, |
|
"loss": 0.0779, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.1305767138193689, |
|
"grad_norm": 0.09437372535467148, |
|
"learning_rate": 0.0001928601295474208, |
|
"loss": 0.0975, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13166485310119697, |
|
"grad_norm": 0.1258208006620407, |
|
"learning_rate": 0.00019273132668821364, |
|
"loss": 0.1277, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.13275299238302501, |
|
"grad_norm": 0.09919868409633636, |
|
"learning_rate": 0.00019260141619056507, |
|
"loss": 0.0993, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.1338411316648531, |
|
"grad_norm": 0.12028370052576065, |
|
"learning_rate": 0.0001924703996062038, |
|
"loss": 0.0751, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.13492927094668117, |
|
"grad_norm": 0.10702817142009735, |
|
"learning_rate": 0.00019233827850007027, |
|
"loss": 0.0949, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.13601741022850924, |
|
"grad_norm": 0.10939855128526688, |
|
"learning_rate": 0.000192205054450298, |
|
"loss": 0.0977, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.13710554951033732, |
|
"grad_norm": 0.11803679168224335, |
|
"learning_rate": 0.00019207072904819486, |
|
"loss": 0.0877, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.1381936887921654, |
|
"grad_norm": 0.1382649838924408, |
|
"learning_rate": 0.00019193530389822363, |
|
"loss": 0.1017, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.13928182807399347, |
|
"grad_norm": 0.1433139145374298, |
|
"learning_rate": 0.00019179878061798347, |
|
"loss": 0.0748, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.14036996735582155, |
|
"grad_norm": 0.14679527282714844, |
|
"learning_rate": 0.00019166116083819002, |
|
"loss": 0.1164, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.14145810663764963, |
|
"grad_norm": 0.13680118322372437, |
|
"learning_rate": 0.0001915224462026563, |
|
"loss": 0.1149, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1425462459194777, |
|
"grad_norm": 0.16263848543167114, |
|
"learning_rate": 0.00019138263836827288, |
|
"loss": 0.1192, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.14363438520130578, |
|
"grad_norm": 0.16534928977489471, |
|
"learning_rate": 0.00019124173900498818, |
|
"loss": 0.1138, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.14472252448313383, |
|
"grad_norm": 0.21276706457138062, |
|
"learning_rate": 0.0001910997497957885, |
|
"loss": 0.1461, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.1458106637649619, |
|
"grad_norm": 0.2375650703907013, |
|
"learning_rate": 0.0001909566724366779, |
|
"loss": 0.2031, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.14689880304678998, |
|
"grad_norm": 0.20974405109882355, |
|
"learning_rate": 0.00019081250863665794, |
|
"loss": 0.1285, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.14798694232861806, |
|
"grad_norm": 0.308624267578125, |
|
"learning_rate": 0.00019066726011770726, |
|
"loss": 0.1717, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.14907508161044614, |
|
"grad_norm": 0.21192695200443268, |
|
"learning_rate": 0.0001905209286147611, |
|
"loss": 0.1064, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.1501632208922742, |
|
"grad_norm": 0.20596542954444885, |
|
"learning_rate": 0.0001903735158756905, |
|
"loss": 0.0975, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.1512513601741023, |
|
"grad_norm": 0.21547934412956238, |
|
"learning_rate": 0.00019022502366128135, |
|
"loss": 0.1255, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.15233949945593037, |
|
"grad_norm": 0.276815801858902, |
|
"learning_rate": 0.00019007545374521355, |
|
"loss": 0.0868, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15342763873775844, |
|
"grad_norm": 0.37251031398773193, |
|
"learning_rate": 0.00018992480791403958, |
|
"loss": 0.1078, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.15451577801958652, |
|
"grad_norm": 0.328265517950058, |
|
"learning_rate": 0.0001897730879671634, |
|
"loss": 0.1842, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.15560391730141457, |
|
"grad_norm": 0.4400005340576172, |
|
"learning_rate": 0.00018962029571681886, |
|
"loss": 0.1857, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.15669205658324264, |
|
"grad_norm": 0.28378888964653015, |
|
"learning_rate": 0.00018946643298804793, |
|
"loss": 0.0955, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.15778019586507072, |
|
"grad_norm": 0.609008252620697, |
|
"learning_rate": 0.00018931150161867916, |
|
"loss": 0.3827, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.1588683351468988, |
|
"grad_norm": 0.3973180055618286, |
|
"learning_rate": 0.0001891555034593055, |
|
"loss": 0.1844, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.15995647442872687, |
|
"grad_norm": 0.36245423555374146, |
|
"learning_rate": 0.00018899844037326225, |
|
"loss": 0.2005, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.16104461371055495, |
|
"grad_norm": 0.5730637311935425, |
|
"learning_rate": 0.0001888403142366049, |
|
"loss": 0.2742, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.16213275299238303, |
|
"grad_norm": 0.5383718013763428, |
|
"learning_rate": 0.00018868112693808665, |
|
"loss": 0.2249, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.1632208922742111, |
|
"grad_norm": 0.9835379123687744, |
|
"learning_rate": 0.00018852088037913577, |
|
"loss": 0.344, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16430903155603918, |
|
"grad_norm": 0.09142426401376724, |
|
"learning_rate": 0.00018835957647383303, |
|
"loss": 0.0876, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.16539717083786726, |
|
"grad_norm": 0.09199874103069305, |
|
"learning_rate": 0.00018819721714888877, |
|
"loss": 0.0798, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.16648531011969533, |
|
"grad_norm": 0.08299195021390915, |
|
"learning_rate": 0.00018803380434362, |
|
"loss": 0.0746, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.16757344940152338, |
|
"grad_norm": 0.10273315012454987, |
|
"learning_rate": 0.00018786934000992688, |
|
"loss": 0.101, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.16866158868335146, |
|
"grad_norm": 0.08524151146411896, |
|
"learning_rate": 0.00018770382611226987, |
|
"loss": 0.0684, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.16974972796517954, |
|
"grad_norm": 0.10515481233596802, |
|
"learning_rate": 0.000187537264627646, |
|
"loss": 0.0809, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.1708378672470076, |
|
"grad_norm": 0.09009183198213577, |
|
"learning_rate": 0.00018736965754556528, |
|
"loss": 0.0955, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.1719260065288357, |
|
"grad_norm": 0.10571747273206711, |
|
"learning_rate": 0.00018720100686802694, |
|
"loss": 0.0847, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.17301414581066377, |
|
"grad_norm": 0.08275768160820007, |
|
"learning_rate": 0.00018703131460949554, |
|
"loss": 0.0799, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.17410228509249184, |
|
"grad_norm": 0.08440782129764557, |
|
"learning_rate": 0.00018686058279687698, |
|
"loss": 0.0672, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17519042437431992, |
|
"grad_norm": 0.10261505097150803, |
|
"learning_rate": 0.00018668881346949417, |
|
"loss": 0.1004, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.176278563656148, |
|
"grad_norm": 0.08730655908584595, |
|
"learning_rate": 0.00018651600867906272, |
|
"loss": 0.0711, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.17736670293797607, |
|
"grad_norm": 0.10591359436511993, |
|
"learning_rate": 0.00018634217048966637, |
|
"loss": 0.0919, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.17845484221980412, |
|
"grad_norm": 0.09251653403043747, |
|
"learning_rate": 0.0001861673009777325, |
|
"loss": 0.078, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.1795429815016322, |
|
"grad_norm": 0.106822170317173, |
|
"learning_rate": 0.00018599140223200716, |
|
"loss": 0.0895, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.18063112078346028, |
|
"grad_norm": 0.11222364753484726, |
|
"learning_rate": 0.0001858144763535302, |
|
"loss": 0.0918, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.18171926006528835, |
|
"grad_norm": 0.1363314390182495, |
|
"learning_rate": 0.00018563652545561013, |
|
"loss": 0.1126, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.18280739934711643, |
|
"grad_norm": 0.09316174685955048, |
|
"learning_rate": 0.000185457551663799, |
|
"loss": 0.0799, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.1838955386289445, |
|
"grad_norm": 0.13098089396953583, |
|
"learning_rate": 0.00018527755711586678, |
|
"loss": 0.0994, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.18498367791077258, |
|
"grad_norm": 0.11433115601539612, |
|
"learning_rate": 0.00018509654396177609, |
|
"loss": 0.1072, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18607181719260066, |
|
"grad_norm": 0.11261814087629318, |
|
"learning_rate": 0.00018491451436365627, |
|
"loss": 0.1011, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.18715995647442873, |
|
"grad_norm": 0.1038559302687645, |
|
"learning_rate": 0.00018473147049577774, |
|
"loss": 0.0746, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.1882480957562568, |
|
"grad_norm": 0.11395396292209625, |
|
"learning_rate": 0.00018454741454452603, |
|
"loss": 0.0792, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.1893362350380849, |
|
"grad_norm": 0.13332821428775787, |
|
"learning_rate": 0.00018436234870837547, |
|
"loss": 0.1041, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.19042437431991294, |
|
"grad_norm": 0.12438289821147919, |
|
"learning_rate": 0.00018417627519786315, |
|
"loss": 0.1066, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.191512513601741, |
|
"grad_norm": 0.1353287398815155, |
|
"learning_rate": 0.00018398919623556238, |
|
"loss": 0.1193, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.1926006528835691, |
|
"grad_norm": 0.13928581774234772, |
|
"learning_rate": 0.0001838011140560562, |
|
"loss": 0.1228, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.19368879216539717, |
|
"grad_norm": 0.1474994421005249, |
|
"learning_rate": 0.00018361203090591071, |
|
"loss": 0.0812, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.19477693144722524, |
|
"grad_norm": 0.1910678595304489, |
|
"learning_rate": 0.00018342194904364813, |
|
"loss": 0.1435, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.19586507072905332, |
|
"grad_norm": 0.16526034474372864, |
|
"learning_rate": 0.00018323087073971993, |
|
"loss": 0.1136, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1969532100108814, |
|
"grad_norm": 0.1933068335056305, |
|
"learning_rate": 0.00018303879827647975, |
|
"loss": 0.1498, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.19804134929270947, |
|
"grad_norm": 0.1647845059633255, |
|
"learning_rate": 0.00018284573394815597, |
|
"loss": 0.0764, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.19912948857453755, |
|
"grad_norm": 0.19414739310741425, |
|
"learning_rate": 0.00018265168006082437, |
|
"loss": 0.1142, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.20021762785636563, |
|
"grad_norm": 0.1872360110282898, |
|
"learning_rate": 0.00018245663893238075, |
|
"loss": 0.1169, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.20130576713819368, |
|
"grad_norm": 0.19919492304325104, |
|
"learning_rate": 0.00018226061289251298, |
|
"loss": 0.0854, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.20239390642002175, |
|
"grad_norm": 0.2233375757932663, |
|
"learning_rate": 0.00018206360428267332, |
|
"loss": 0.1271, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.20348204570184983, |
|
"grad_norm": 0.22116345167160034, |
|
"learning_rate": 0.00018186561545605054, |
|
"loss": 0.1402, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.2045701849836779, |
|
"grad_norm": 0.253864049911499, |
|
"learning_rate": 0.0001816666487775416, |
|
"loss": 0.1431, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.20565832426550598, |
|
"grad_norm": 0.2945636212825775, |
|
"learning_rate": 0.00018146670662372354, |
|
"loss": 0.1284, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.20674646354733406, |
|
"grad_norm": 0.24834126234054565, |
|
"learning_rate": 0.00018126579138282503, |
|
"loss": 0.098, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.20783460282916214, |
|
"grad_norm": 0.26815730333328247, |
|
"learning_rate": 0.00018106390545469795, |
|
"loss": 0.0877, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.2089227421109902, |
|
"grad_norm": 0.375293493270874, |
|
"learning_rate": 0.00018086105125078857, |
|
"loss": 0.1985, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.2100108813928183, |
|
"grad_norm": 0.4025906026363373, |
|
"learning_rate": 0.00018065723119410884, |
|
"loss": 0.2082, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.21109902067464636, |
|
"grad_norm": 0.3551553785800934, |
|
"learning_rate": 0.0001804524477192075, |
|
"loss": 0.2305, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.21218715995647444, |
|
"grad_norm": 0.594780445098877, |
|
"learning_rate": 0.00018024670327214084, |
|
"loss": 0.2713, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2132752992383025, |
|
"grad_norm": 0.3940027356147766, |
|
"learning_rate": 0.0001800400003104436, |
|
"loss": 0.1623, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.21436343852013057, |
|
"grad_norm": 0.51041579246521, |
|
"learning_rate": 0.00017983234130309968, |
|
"loss": 0.2236, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.21545157780195864, |
|
"grad_norm": 0.6203753352165222, |
|
"learning_rate": 0.00017962372873051252, |
|
"loss": 0.2654, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.21653971708378672, |
|
"grad_norm": 0.7527713179588318, |
|
"learning_rate": 0.00017941416508447536, |
|
"loss": 0.2088, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.2176278563656148, |
|
"grad_norm": 1.1047406196594238, |
|
"learning_rate": 0.00017920365286814183, |
|
"loss": 0.3097, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21871599564744287, |
|
"grad_norm": 0.0492124930024147, |
|
"learning_rate": 0.0001789921945959958, |
|
"loss": 0.0344, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.21980413492927095, |
|
"grad_norm": 0.07087790220975876, |
|
"learning_rate": 0.00017877979279382135, |
|
"loss": 0.0582, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.22089227421109903, |
|
"grad_norm": 0.07622935622930527, |
|
"learning_rate": 0.00017856644999867264, |
|
"loss": 0.062, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.2219804134929271, |
|
"grad_norm": 0.08792652189731598, |
|
"learning_rate": 0.00017835216875884368, |
|
"loss": 0.0511, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.22306855277475518, |
|
"grad_norm": 0.08028998970985413, |
|
"learning_rate": 0.0001781369516338378, |
|
"loss": 0.0665, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.22415669205658323, |
|
"grad_norm": 0.08997032046318054, |
|
"learning_rate": 0.0001779208011943371, |
|
"loss": 0.069, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.2252448313384113, |
|
"grad_norm": 0.08684886246919632, |
|
"learning_rate": 0.00017770372002217172, |
|
"loss": 0.077, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.22633297062023938, |
|
"grad_norm": 0.0965440422296524, |
|
"learning_rate": 0.000177485710710289, |
|
"loss": 0.0782, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.22742110990206746, |
|
"grad_norm": 0.09060367196798325, |
|
"learning_rate": 0.00017726677586272263, |
|
"loss": 0.066, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.22850924918389554, |
|
"grad_norm": 0.0900409147143364, |
|
"learning_rate": 0.00017704691809456143, |
|
"loss": 0.0707, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2295973884657236, |
|
"grad_norm": 0.10733999311923981, |
|
"learning_rate": 0.00017682614003191807, |
|
"loss": 0.0916, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.2306855277475517, |
|
"grad_norm": 0.09372083842754364, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.0872, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.23177366702937977, |
|
"grad_norm": 0.10344577580690384, |
|
"learning_rate": 0.00017638183358256696, |
|
"loss": 0.0903, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.23286180631120784, |
|
"grad_norm": 0.1084800437092781, |
|
"learning_rate": 0.0001761583105029213, |
|
"loss": 0.0926, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.23394994559303592, |
|
"grad_norm": 0.08565113693475723, |
|
"learning_rate": 0.00017593387774285412, |
|
"loss": 0.0758, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.235038084874864, |
|
"grad_norm": 0.11589045077562332, |
|
"learning_rate": 0.0001757085379831246, |
|
"loss": 0.0925, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.23612622415669204, |
|
"grad_norm": 0.12087468057870865, |
|
"learning_rate": 0.00017548229391532572, |
|
"loss": 0.1012, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.23721436343852012, |
|
"grad_norm": 0.1125798150897026, |
|
"learning_rate": 0.00017525514824185185, |
|
"loss": 0.109, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.2383025027203482, |
|
"grad_norm": 0.12492644041776657, |
|
"learning_rate": 0.00017502710367586687, |
|
"loss": 0.1048, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.23939064200217627, |
|
"grad_norm": 0.09837982058525085, |
|
"learning_rate": 0.00017479816294127152, |
|
"loss": 0.0803, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24047878128400435, |
|
"grad_norm": 0.099558524787426, |
|
"learning_rate": 0.00017456832877267084, |
|
"loss": 0.0552, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.24156692056583243, |
|
"grad_norm": 0.095551498234272, |
|
"learning_rate": 0.00017433760391534167, |
|
"loss": 0.0905, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.2426550598476605, |
|
"grad_norm": 0.11664412170648575, |
|
"learning_rate": 0.0001741059911251997, |
|
"loss": 0.1005, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.24374319912948858, |
|
"grad_norm": 0.1248706802725792, |
|
"learning_rate": 0.00017387349316876666, |
|
"loss": 0.1135, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.24483133841131666, |
|
"grad_norm": 0.13133874535560608, |
|
"learning_rate": 0.0001736401128231373, |
|
"loss": 0.121, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.24591947769314473, |
|
"grad_norm": 0.12476039677858353, |
|
"learning_rate": 0.00017340585287594604, |
|
"loss": 0.1025, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.2470076169749728, |
|
"grad_norm": 0.1645650863647461, |
|
"learning_rate": 0.0001731707161253338, |
|
"loss": 0.1313, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.24809575625680086, |
|
"grad_norm": 0.1172671690583229, |
|
"learning_rate": 0.00017293470537991463, |
|
"loss": 0.0801, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.24918389553862894, |
|
"grad_norm": 0.17031441628932953, |
|
"learning_rate": 0.00017269782345874203, |
|
"loss": 0.154, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.250272034820457, |
|
"grad_norm": 0.16571593284606934, |
|
"learning_rate": 0.00017246007319127545, |
|
"loss": 0.1209, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.250272034820457, |
|
"eval_loss": 0.12318640202283859, |
|
"eval_runtime": 24.4163, |
|
"eval_samples_per_second": 15.85, |
|
"eval_steps_per_second": 7.945, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2513601741022851, |
|
"grad_norm": 0.14655253291130066, |
|
"learning_rate": 0.00017222145741734626, |
|
"loss": 0.0879, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.25244831338411317, |
|
"grad_norm": 0.17367680370807648, |
|
"learning_rate": 0.00017198197898712404, |
|
"loss": 0.1261, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.2535364526659412, |
|
"grad_norm": 0.14948749542236328, |
|
"learning_rate": 0.0001717416407610824, |
|
"loss": 0.0874, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.2546245919477693, |
|
"grad_norm": 0.19695116579532623, |
|
"learning_rate": 0.00017150044560996488, |
|
"loss": 0.1119, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.25571273122959737, |
|
"grad_norm": 0.2416209876537323, |
|
"learning_rate": 0.00017125839641475072, |
|
"loss": 0.1495, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.25680087051142547, |
|
"grad_norm": 0.23595106601715088, |
|
"learning_rate": 0.00017101549606662024, |
|
"loss": 0.092, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.2578890097932535, |
|
"grad_norm": 0.3377005457878113, |
|
"learning_rate": 0.00017077174746692056, |
|
"loss": 0.1537, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.2589771490750816, |
|
"grad_norm": 0.31011515855789185, |
|
"learning_rate": 0.00017052715352713075, |
|
"loss": 0.2351, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.2600652883569097, |
|
"grad_norm": 0.2296973615884781, |
|
"learning_rate": 0.00017028171716882714, |
|
"loss": 0.1034, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.2611534276387378, |
|
"grad_norm": 0.33184927701950073, |
|
"learning_rate": 0.00017003544132364846, |
|
"loss": 0.1518, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2622415669205658, |
|
"grad_norm": 0.333794504404068, |
|
"learning_rate": 0.00016978832893326074, |
|
"loss": 0.1167, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.26332970620239393, |
|
"grad_norm": 0.33567357063293457, |
|
"learning_rate": 0.00016954038294932216, |
|
"loss": 0.1672, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.264417845484222, |
|
"grad_norm": 0.3648099899291992, |
|
"learning_rate": 0.0001692916063334479, |
|
"loss": 0.1562, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.26550598476605003, |
|
"grad_norm": 0.3762454092502594, |
|
"learning_rate": 0.0001690420020571747, |
|
"loss": 0.1495, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.26659412404787813, |
|
"grad_norm": 0.42424383759498596, |
|
"learning_rate": 0.00016879157310192535, |
|
"loss": 0.1763, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2676822633297062, |
|
"grad_norm": 0.4968826472759247, |
|
"learning_rate": 0.00016854032245897308, |
|
"loss": 0.2473, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.2687704026115343, |
|
"grad_norm": 0.5231485366821289, |
|
"learning_rate": 0.00016828825312940592, |
|
"loss": 0.2924, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.26985854189336234, |
|
"grad_norm": 0.5466935634613037, |
|
"learning_rate": 0.00016803536812409075, |
|
"loss": 0.2519, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.27094668117519044, |
|
"grad_norm": 0.6696439981460571, |
|
"learning_rate": 0.00016778167046363734, |
|
"loss": 0.2106, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.2720348204570185, |
|
"grad_norm": 0.7066907286643982, |
|
"learning_rate": 0.00016752716317836229, |
|
"loss": 0.2733, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2731229597388466, |
|
"grad_norm": 0.058309707790613174, |
|
"learning_rate": 0.00016727184930825288, |
|
"loss": 0.0459, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.27421109902067464, |
|
"grad_norm": 0.06278934329748154, |
|
"learning_rate": 0.00016701573190293077, |
|
"loss": 0.049, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.27529923830250275, |
|
"grad_norm": 0.07942797988653183, |
|
"learning_rate": 0.00016675881402161536, |
|
"loss": 0.0757, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.2763873775843308, |
|
"grad_norm": 0.0874176099896431, |
|
"learning_rate": 0.00016650109873308765, |
|
"loss": 0.0952, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.27747551686615884, |
|
"grad_norm": 0.0788157656788826, |
|
"learning_rate": 0.0001662425891156531, |
|
"loss": 0.0655, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.27856365614798695, |
|
"grad_norm": 0.08784733712673187, |
|
"learning_rate": 0.00016598328825710533, |
|
"loss": 0.0778, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.279651795429815, |
|
"grad_norm": 0.09089700132608414, |
|
"learning_rate": 0.00016572319925468892, |
|
"loss": 0.0767, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.2807399347116431, |
|
"grad_norm": 0.07957662642002106, |
|
"learning_rate": 0.0001654623252150624, |
|
"loss": 0.0623, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.28182807399347115, |
|
"grad_norm": 0.08320681005716324, |
|
"learning_rate": 0.00016520066925426144, |
|
"loss": 0.0812, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.28291621327529926, |
|
"grad_norm": 0.10143834352493286, |
|
"learning_rate": 0.00016493823449766136, |
|
"loss": 0.0953, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2840043525571273, |
|
"grad_norm": 0.1192905604839325, |
|
"learning_rate": 0.00016467502407993992, |
|
"loss": 0.1163, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.2850924918389554, |
|
"grad_norm": 0.11428846418857574, |
|
"learning_rate": 0.0001644110411450398, |
|
"loss": 0.1028, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.28618063112078346, |
|
"grad_norm": 0.11233223229646683, |
|
"learning_rate": 0.00016414628884613107, |
|
"loss": 0.091, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.28726877040261156, |
|
"grad_norm": 0.10367966443300247, |
|
"learning_rate": 0.00016388077034557355, |
|
"loss": 0.0797, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.2883569096844396, |
|
"grad_norm": 0.11604032665491104, |
|
"learning_rate": 0.00016361448881487914, |
|
"loss": 0.0919, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.28944504896626766, |
|
"grad_norm": 0.10309276729822159, |
|
"learning_rate": 0.00016334744743467364, |
|
"loss": 0.1065, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.29053318824809576, |
|
"grad_norm": 0.11475658416748047, |
|
"learning_rate": 0.00016307964939465914, |
|
"loss": 0.0959, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.2916213275299238, |
|
"grad_norm": 0.1230575293302536, |
|
"learning_rate": 0.0001628110978935756, |
|
"loss": 0.1031, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.2927094668117519, |
|
"grad_norm": 0.1267620474100113, |
|
"learning_rate": 0.00016254179613916278, |
|
"loss": 0.1219, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.29379760609357997, |
|
"grad_norm": 0.1032036617398262, |
|
"learning_rate": 0.000162271747348122, |
|
"loss": 0.0792, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29488574537540807, |
|
"grad_norm": 0.10867134481668472, |
|
"learning_rate": 0.00016200095474607753, |
|
"loss": 0.0964, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.2959738846572361, |
|
"grad_norm": 0.13934585452079773, |
|
"learning_rate": 0.0001617294215675382, |
|
"loss": 0.1493, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.2970620239390642, |
|
"grad_norm": 0.1254916936159134, |
|
"learning_rate": 0.0001614571510558588, |
|
"loss": 0.1035, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.2981501632208923, |
|
"grad_norm": 0.10226383805274963, |
|
"learning_rate": 0.0001611841464632011, |
|
"loss": 0.0777, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.2992383025027203, |
|
"grad_norm": 0.11369970440864563, |
|
"learning_rate": 0.0001609104110504954, |
|
"loss": 0.0823, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3003264417845484, |
|
"grad_norm": 0.11098276078701019, |
|
"learning_rate": 0.00016063594808740113, |
|
"loss": 0.0976, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.3014145810663765, |
|
"grad_norm": 0.13366885483264923, |
|
"learning_rate": 0.00016036076085226814, |
|
"loss": 0.1378, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.3025027203482046, |
|
"grad_norm": 0.11494230479001999, |
|
"learning_rate": 0.00016008485263209742, |
|
"loss": 0.072, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.30359085963003263, |
|
"grad_norm": 0.11145862936973572, |
|
"learning_rate": 0.0001598082267225018, |
|
"loss": 0.066, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.30467899891186073, |
|
"grad_norm": 0.1483200490474701, |
|
"learning_rate": 0.0001595308864276666, |
|
"loss": 0.1023, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3057671381936888, |
|
"grad_norm": 0.12836772203445435, |
|
"learning_rate": 0.0001592528350603103, |
|
"loss": 0.0682, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.3068552774755169, |
|
"grad_norm": 0.16118410229682922, |
|
"learning_rate": 0.00015897407594164467, |
|
"loss": 0.1025, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.30794341675734493, |
|
"grad_norm": 0.22559022903442383, |
|
"learning_rate": 0.0001586946124013354, |
|
"loss": 0.1228, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.30903155603917304, |
|
"grad_norm": 0.233434796333313, |
|
"learning_rate": 0.0001584144477774623, |
|
"loss": 0.1928, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.3101196953210011, |
|
"grad_norm": 0.21861650049686432, |
|
"learning_rate": 0.00015813358541647915, |
|
"loss": 0.1054, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.31120783460282914, |
|
"grad_norm": 0.2723356783390045, |
|
"learning_rate": 0.00015785202867317407, |
|
"loss": 0.1411, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.31229597388465724, |
|
"grad_norm": 0.3065739870071411, |
|
"learning_rate": 0.0001575697809106292, |
|
"loss": 0.1785, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.3133841131664853, |
|
"grad_norm": 0.2983495593070984, |
|
"learning_rate": 0.00015728684550018064, |
|
"loss": 0.1402, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.3144722524483134, |
|
"grad_norm": 0.3250825107097626, |
|
"learning_rate": 0.00015700322582137827, |
|
"loss": 0.1929, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.31556039173014144, |
|
"grad_norm": 0.35388973355293274, |
|
"learning_rate": 0.00015671892526194516, |
|
"loss": 0.1791, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.31664853101196955, |
|
"grad_norm": 0.32610148191452026, |
|
"learning_rate": 0.0001564339472177373, |
|
"loss": 0.1289, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.3177366702937976, |
|
"grad_norm": 0.4028049409389496, |
|
"learning_rate": 0.0001561482950927029, |
|
"loss": 0.2026, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.3188248095756257, |
|
"grad_norm": 0.2420492172241211, |
|
"learning_rate": 0.00015586197229884184, |
|
"loss": 0.098, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.31991294885745375, |
|
"grad_norm": 0.3512971103191376, |
|
"learning_rate": 0.00015557498225616487, |
|
"loss": 0.205, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.32100108813928185, |
|
"grad_norm": 0.39271989464759827, |
|
"learning_rate": 0.00015528732839265272, |
|
"loss": 0.1473, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.3220892274211099, |
|
"grad_norm": 0.3802226185798645, |
|
"learning_rate": 0.0001549990141442153, |
|
"loss": 0.136, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.32317736670293795, |
|
"grad_norm": 0.5737869739532471, |
|
"learning_rate": 0.00015471004295465035, |
|
"loss": 0.3053, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.32426550598476606, |
|
"grad_norm": 0.45224013924598694, |
|
"learning_rate": 0.00015442041827560274, |
|
"loss": 0.222, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.3253536452665941, |
|
"grad_norm": 0.522432267665863, |
|
"learning_rate": 0.00015413014356652286, |
|
"loss": 0.1809, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.3264417845484222, |
|
"grad_norm": 0.6229780316352844, |
|
"learning_rate": 0.00015383922229462549, |
|
"loss": 0.2081, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.32752992383025026, |
|
"grad_norm": 0.053111448884010315, |
|
"learning_rate": 0.00015354765793484834, |
|
"loss": 0.0414, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.32861806311207836, |
|
"grad_norm": 0.07464036345481873, |
|
"learning_rate": 0.0001532554539698105, |
|
"loss": 0.0639, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.3297062023939064, |
|
"grad_norm": 0.08635352551937103, |
|
"learning_rate": 0.00015296261388977108, |
|
"loss": 0.074, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.3307943416757345, |
|
"grad_norm": 0.0818236917257309, |
|
"learning_rate": 0.000152669141192587, |
|
"loss": 0.0843, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.33188248095756256, |
|
"grad_norm": 0.08959626406431198, |
|
"learning_rate": 0.00015237503938367186, |
|
"loss": 0.0752, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.33297062023939067, |
|
"grad_norm": 0.087018683552742, |
|
"learning_rate": 0.00015208031197595356, |
|
"loss": 0.074, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.3340587595212187, |
|
"grad_norm": 0.10946961492300034, |
|
"learning_rate": 0.00015178496248983254, |
|
"loss": 0.0907, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.33514689880304677, |
|
"grad_norm": 0.09914237260818481, |
|
"learning_rate": 0.00015148899445313981, |
|
"loss": 0.0939, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.33623503808487487, |
|
"grad_norm": 0.07641992717981339, |
|
"learning_rate": 0.00015119241140109467, |
|
"loss": 0.0587, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.3373231773667029, |
|
"grad_norm": 0.10857319831848145, |
|
"learning_rate": 0.00015089521687626243, |
|
"loss": 0.1052, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.338411316648531, |
|
"grad_norm": 0.0868939459323883, |
|
"learning_rate": 0.0001505974144285124, |
|
"loss": 0.0723, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.3394994559303591, |
|
"grad_norm": 0.11470666527748108, |
|
"learning_rate": 0.00015029900761497506, |
|
"loss": 0.0972, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.3405875952121872, |
|
"grad_norm": 0.09828225523233414, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.0904, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.3416757344940152, |
|
"grad_norm": 0.09422045201063156, |
|
"learning_rate": 0.00014970039515511304, |
|
"loss": 0.0736, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.34276387377584333, |
|
"grad_norm": 0.09876245260238647, |
|
"learning_rate": 0.0001494001966589736, |
|
"loss": 0.0849, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.3438520130576714, |
|
"grad_norm": 0.1073005348443985, |
|
"learning_rate": 0.00014909940809733222, |
|
"loss": 0.0842, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.34494015233949943, |
|
"grad_norm": 0.11519600450992584, |
|
"learning_rate": 0.00014879803306298736, |
|
"loss": 0.0901, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.34602829162132753, |
|
"grad_norm": 0.10380937904119492, |
|
"learning_rate": 0.00014849607515574276, |
|
"loss": 0.0688, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.3471164309031556, |
|
"grad_norm": 0.10230353474617004, |
|
"learning_rate": 0.00014819353798236427, |
|
"loss": 0.064, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.3482045701849837, |
|
"grad_norm": 0.10846245288848877, |
|
"learning_rate": 0.00014789042515653687, |
|
"loss": 0.0815, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.34929270946681173, |
|
"grad_norm": 0.11520566791296005, |
|
"learning_rate": 0.00014758674029882152, |
|
"loss": 0.0846, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.35038084874863984, |
|
"grad_norm": 0.16834412515163422, |
|
"learning_rate": 0.00014728248703661182, |
|
"loss": 0.1249, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.3514689880304679, |
|
"grad_norm": 0.11053828150033951, |
|
"learning_rate": 0.00014697766900409074, |
|
"loss": 0.073, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.352557127312296, |
|
"grad_norm": 0.12219499796628952, |
|
"learning_rate": 0.0001466722898421873, |
|
"loss": 0.0943, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.35364526659412404, |
|
"grad_norm": 0.1294214427471161, |
|
"learning_rate": 0.00014636635319853275, |
|
"loss": 0.0761, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.35473340587595215, |
|
"grad_norm": 0.13043484091758728, |
|
"learning_rate": 0.00014605986272741748, |
|
"loss": 0.1065, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.3558215451577802, |
|
"grad_norm": 0.1078469529747963, |
|
"learning_rate": 0.00014575282208974702, |
|
"loss": 0.0718, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.35690968443960824, |
|
"grad_norm": 0.17083537578582764, |
|
"learning_rate": 0.00014544523495299842, |
|
"loss": 0.1035, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.35799782372143635, |
|
"grad_norm": 0.1370207518339157, |
|
"learning_rate": 0.00014513710499117647, |
|
"loss": 0.089, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.3590859630032644, |
|
"grad_norm": 0.1698474884033203, |
|
"learning_rate": 0.00014482843588476974, |
|
"loss": 0.1172, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3601741022850925, |
|
"grad_norm": 0.1472265124320984, |
|
"learning_rate": 0.0001445192313207067, |
|
"loss": 0.0782, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.36126224156692055, |
|
"grad_norm": 0.153669074177742, |
|
"learning_rate": 0.00014420949499231172, |
|
"loss": 0.0844, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.36235038084874865, |
|
"grad_norm": 0.2612091600894928, |
|
"learning_rate": 0.00014389923059926062, |
|
"loss": 0.1256, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.3634385201305767, |
|
"grad_norm": 0.18867933750152588, |
|
"learning_rate": 0.00014358844184753712, |
|
"loss": 0.0822, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.3645266594124048, |
|
"grad_norm": 0.29405227303504944, |
|
"learning_rate": 0.0001432771324493879, |
|
"loss": 0.1562, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.36561479869423286, |
|
"grad_norm": 0.299411803483963, |
|
"learning_rate": 0.00014296530612327863, |
|
"loss": 0.151, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.36670293797606096, |
|
"grad_norm": 0.4020368754863739, |
|
"learning_rate": 0.00014265296659384956, |
|
"loss": 0.2564, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.367791077257889, |
|
"grad_norm": 0.27561965584754944, |
|
"learning_rate": 0.00014234011759187083, |
|
"loss": 0.1193, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.36887921653971706, |
|
"grad_norm": 0.36899837851524353, |
|
"learning_rate": 0.00014202676285419812, |
|
"loss": 0.1844, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.36996735582154516, |
|
"grad_norm": 0.3305605351924896, |
|
"learning_rate": 0.0001417129061237278, |
|
"loss": 0.0825, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3710554951033732, |
|
"grad_norm": 0.45063266158103943, |
|
"learning_rate": 0.00014139855114935252, |
|
"loss": 0.2383, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.3721436343852013, |
|
"grad_norm": 0.319297730922699, |
|
"learning_rate": 0.0001410837016859161, |
|
"loss": 0.1054, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.37323177366702937, |
|
"grad_norm": 0.3711492121219635, |
|
"learning_rate": 0.00014076836149416887, |
|
"loss": 0.1265, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.37431991294885747, |
|
"grad_norm": 0.4362325966358185, |
|
"learning_rate": 0.0001404525343407228, |
|
"loss": 0.1818, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.3754080522306855, |
|
"grad_norm": 0.39839836955070496, |
|
"learning_rate": 0.00014013622399800627, |
|
"loss": 0.173, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3764961915125136, |
|
"grad_norm": 0.4215060770511627, |
|
"learning_rate": 0.00013981943424421932, |
|
"loss": 0.1225, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.37758433079434167, |
|
"grad_norm": 0.4466668963432312, |
|
"learning_rate": 0.0001395021688632882, |
|
"loss": 0.1336, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.3786724700761698, |
|
"grad_norm": 0.490313321352005, |
|
"learning_rate": 0.00013918443164482046, |
|
"loss": 0.1364, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.3797606093579978, |
|
"grad_norm": 0.6865617036819458, |
|
"learning_rate": 0.00013886622638405952, |
|
"loss": 0.2864, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.3808487486398259, |
|
"grad_norm": 0.7716324925422668, |
|
"learning_rate": 0.0001385475568818394, |
|
"loss": 0.2994, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.381936887921654, |
|
"grad_norm": 0.04748038947582245, |
|
"learning_rate": 0.00013822842694453924, |
|
"loss": 0.0425, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.383025027203482, |
|
"grad_norm": 0.06222306191921234, |
|
"learning_rate": 0.00013790884038403795, |
|
"loss": 0.0519, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.38411316648531013, |
|
"grad_norm": 0.07450418174266815, |
|
"learning_rate": 0.0001375888010176686, |
|
"loss": 0.0629, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.3852013057671382, |
|
"grad_norm": 0.08733393251895905, |
|
"learning_rate": 0.00013726831266817278, |
|
"loss": 0.0701, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.3862894450489663, |
|
"grad_norm": 0.09622704982757568, |
|
"learning_rate": 0.00013694737916365517, |
|
"loss": 0.0909, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.38737758433079433, |
|
"grad_norm": 0.08062370121479034, |
|
"learning_rate": 0.00013662600433753745, |
|
"loss": 0.0722, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.38846572361262244, |
|
"grad_norm": 0.09811591356992722, |
|
"learning_rate": 0.00013630419202851284, |
|
"loss": 0.0894, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.3895538628944505, |
|
"grad_norm": 0.0917980894446373, |
|
"learning_rate": 0.0001359819460805001, |
|
"loss": 0.0816, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.3906420021762786, |
|
"grad_norm": 0.08292034268379211, |
|
"learning_rate": 0.0001356592703425976, |
|
"loss": 0.0832, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.39173014145810664, |
|
"grad_norm": 0.0940559059381485, |
|
"learning_rate": 0.00013533616866903735, |
|
"loss": 0.078, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3928182807399347, |
|
"grad_norm": 0.09960496425628662, |
|
"learning_rate": 0.00013501264491913906, |
|
"loss": 0.0899, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.3939064200217628, |
|
"grad_norm": 0.1174091249704361, |
|
"learning_rate": 0.00013468870295726398, |
|
"loss": 0.0983, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.39499455930359084, |
|
"grad_norm": 0.1083730086684227, |
|
"learning_rate": 0.00013436434665276865, |
|
"loss": 0.1004, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.39608269858541895, |
|
"grad_norm": 0.09829343855381012, |
|
"learning_rate": 0.00013403957987995882, |
|
"loss": 0.0851, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.397170837867247, |
|
"grad_norm": 0.1172933354973793, |
|
"learning_rate": 0.00013371440651804313, |
|
"loss": 0.1033, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.3982589771490751, |
|
"grad_norm": 0.11004797369241714, |
|
"learning_rate": 0.00013338883045108674, |
|
"loss": 0.0852, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.39934711643090315, |
|
"grad_norm": 0.10466606914997101, |
|
"learning_rate": 0.00013306285556796495, |
|
"loss": 0.0893, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.40043525571273125, |
|
"grad_norm": 0.121376633644104, |
|
"learning_rate": 0.0001327364857623168, |
|
"loss": 0.1037, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.4015233949945593, |
|
"grad_norm": 0.10333437472581863, |
|
"learning_rate": 0.00013240972493249847, |
|
"loss": 0.0981, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.40261153427638735, |
|
"grad_norm": 0.1174560934305191, |
|
"learning_rate": 0.00013208257698153677, |
|
"loss": 0.1187, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.40369967355821545, |
|
"grad_norm": 0.09671124815940857, |
|
"learning_rate": 0.0001317550458170826, |
|
"loss": 0.0771, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.4047878128400435, |
|
"grad_norm": 0.11311496794223785, |
|
"learning_rate": 0.00013142713535136414, |
|
"loss": 0.0915, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.4058759521218716, |
|
"grad_norm": 0.11149045825004578, |
|
"learning_rate": 0.00013109884950114007, |
|
"loss": 0.078, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.40696409140369966, |
|
"grad_norm": 0.15049664676189423, |
|
"learning_rate": 0.00013077019218765305, |
|
"loss": 0.1008, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.40805223068552776, |
|
"grad_norm": 0.13566477596759796, |
|
"learning_rate": 0.0001304411673365826, |
|
"loss": 0.1116, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4091403699673558, |
|
"grad_norm": 0.1317652463912964, |
|
"learning_rate": 0.00013011177887799845, |
|
"loss": 0.1068, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.4102285092491839, |
|
"grad_norm": 0.12117652595043182, |
|
"learning_rate": 0.00012978203074631334, |
|
"loss": 0.0926, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.41131664853101196, |
|
"grad_norm": 0.13246335089206696, |
|
"learning_rate": 0.00012945192688023624, |
|
"loss": 0.0867, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.41240478781284007, |
|
"grad_norm": 0.1427900642156601, |
|
"learning_rate": 0.00012912147122272523, |
|
"loss": 0.1054, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.4134929270946681, |
|
"grad_norm": 0.13975268602371216, |
|
"learning_rate": 0.0001287906677209403, |
|
"loss": 0.0993, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.41458106637649617, |
|
"grad_norm": 0.16829046607017517, |
|
"learning_rate": 0.0001284595203261965, |
|
"loss": 0.0986, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.41566920565832427, |
|
"grad_norm": 0.18288354575634003, |
|
"learning_rate": 0.00012812803299391628, |
|
"loss": 0.1164, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.4167573449401523, |
|
"grad_norm": 0.20097504556179047, |
|
"learning_rate": 0.00012779620968358273, |
|
"loss": 0.1273, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.4178454842219804, |
|
"grad_norm": 0.1646791398525238, |
|
"learning_rate": 0.00012746405435869198, |
|
"loss": 0.0833, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.41893362350380847, |
|
"grad_norm": 0.1997787058353424, |
|
"learning_rate": 0.0001271315709867059, |
|
"loss": 0.1495, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.4200217627856366, |
|
"grad_norm": 0.1489897519350052, |
|
"learning_rate": 0.00012679876353900482, |
|
"loss": 0.0756, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.4211099020674646, |
|
"grad_norm": 0.22502455115318298, |
|
"learning_rate": 0.00012646563599083996, |
|
"loss": 0.1427, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.42219804134929273, |
|
"grad_norm": 0.19359458982944489, |
|
"learning_rate": 0.00012613219232128608, |
|
"loss": 0.121, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.4232861806311208, |
|
"grad_norm": 0.244260773062706, |
|
"learning_rate": 0.0001257984365131938, |
|
"loss": 0.1469, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.4243743199129489, |
|
"grad_norm": 0.18485282361507416, |
|
"learning_rate": 0.00012546437255314222, |
|
"loss": 0.0892, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.42546245919477693, |
|
"grad_norm": 0.3717290461063385, |
|
"learning_rate": 0.00012513000443139112, |
|
"loss": 0.2099, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.426550598476605, |
|
"grad_norm": 0.28721094131469727, |
|
"learning_rate": 0.00012479533614183334, |
|
"loss": 0.1193, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.4276387377584331, |
|
"grad_norm": 0.2697299122810364, |
|
"learning_rate": 0.00012446037168194714, |
|
"loss": 0.0965, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.42872687704026113, |
|
"grad_norm": 0.32627496123313904, |
|
"learning_rate": 0.00012412511505274844, |
|
"loss": 0.1832, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.42981501632208924, |
|
"grad_norm": 0.37745073437690735, |
|
"learning_rate": 0.000123789570258743, |
|
"loss": 0.1572, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4309031556039173, |
|
"grad_norm": 0.4901193082332611, |
|
"learning_rate": 0.00012345374130787854, |
|
"loss": 0.1873, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4319912948857454, |
|
"grad_norm": 0.44663485884666443, |
|
"learning_rate": 0.000123117632211497, |
|
"loss": 0.3353, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.43307943416757344, |
|
"grad_norm": 0.34345391392707825, |
|
"learning_rate": 0.0001227812469842864, |
|
"loss": 0.1924, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.43416757344940154, |
|
"grad_norm": 0.5725805759429932, |
|
"learning_rate": 0.00012244458964423327, |
|
"loss": 0.2352, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.4352557127312296, |
|
"grad_norm": 0.5519152879714966, |
|
"learning_rate": 0.0001221076642125742, |
|
"loss": 0.167, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4363438520130577, |
|
"grad_norm": 0.055198315531015396, |
|
"learning_rate": 0.00012177047471374807, |
|
"loss": 0.0472, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.43743199129488575, |
|
"grad_norm": 0.09932799637317657, |
|
"learning_rate": 0.0001214330251753481, |
|
"loss": 0.0783, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.4385201305767138, |
|
"grad_norm": 0.08226185292005539, |
|
"learning_rate": 0.00012109531962807332, |
|
"loss": 0.0656, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.4396082698585419, |
|
"grad_norm": 0.0858379453420639, |
|
"learning_rate": 0.0001207573621056809, |
|
"loss": 0.0741, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.44069640914036995, |
|
"grad_norm": 0.07838830351829529, |
|
"learning_rate": 0.00012041915664493761, |
|
"loss": 0.066, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.44178454842219805, |
|
"grad_norm": 0.08843716233968735, |
|
"learning_rate": 0.00012008070728557186, |
|
"loss": 0.0817, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.4428726877040261, |
|
"grad_norm": 0.09485173225402832, |
|
"learning_rate": 0.00011974201807022525, |
|
"loss": 0.0719, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.4439608269858542, |
|
"grad_norm": 0.12550269067287445, |
|
"learning_rate": 0.00011940309304440433, |
|
"loss": 0.1025, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.44504896626768226, |
|
"grad_norm": 0.10056477040052414, |
|
"learning_rate": 0.00011906393625643244, |
|
"loss": 0.0822, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.44613710554951036, |
|
"grad_norm": 0.08779609948396683, |
|
"learning_rate": 0.00011872455175740112, |
|
"loss": 0.0731, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4472252448313384, |
|
"grad_norm": 0.08771763741970062, |
|
"learning_rate": 0.00011838494360112185, |
|
"loss": 0.0686, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.44831338411316646, |
|
"grad_norm": 0.09602241218090057, |
|
"learning_rate": 0.00011804511584407763, |
|
"loss": 0.0826, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.44940152339499456, |
|
"grad_norm": 0.10052221268415451, |
|
"learning_rate": 0.00011770507254537453, |
|
"loss": 0.0711, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.4504896626768226, |
|
"grad_norm": 0.08452215045690536, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.061, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.4515778019586507, |
|
"grad_norm": 0.09362675249576569, |
|
"learning_rate": 0.00011702435557223987, |
|
"loss": 0.0852, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.45266594124047876, |
|
"grad_norm": 0.10676004737615585, |
|
"learning_rate": 0.00011668369002869912, |
|
"loss": 0.101, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.45375408052230687, |
|
"grad_norm": 0.10523767024278641, |
|
"learning_rate": 0.00011634282520518383, |
|
"loss": 0.0892, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.4548422198041349, |
|
"grad_norm": 0.09733587503433228, |
|
"learning_rate": 0.00011600176517318741, |
|
"loss": 0.0714, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.455930359085963, |
|
"grad_norm": 0.13115546107292175, |
|
"learning_rate": 0.00011566051400653486, |
|
"loss": 0.1079, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.45701849836779107, |
|
"grad_norm": 0.10536440461874008, |
|
"learning_rate": 0.00011531907578133429, |
|
"loss": 0.0807, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4581066376496192, |
|
"grad_norm": 0.10071249306201935, |
|
"learning_rate": 0.00011497745457592816, |
|
"loss": 0.0647, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.4591947769314472, |
|
"grad_norm": 0.12494815140962601, |
|
"learning_rate": 0.00011463565447084445, |
|
"loss": 0.0969, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.4602829162132753, |
|
"grad_norm": 0.10858377069234848, |
|
"learning_rate": 0.00011429367954874819, |
|
"loss": 0.0709, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.4613710554951034, |
|
"grad_norm": 0.10477497428655624, |
|
"learning_rate": 0.00011395153389439233, |
|
"loss": 0.0875, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.4624591947769314, |
|
"grad_norm": 0.12716920673847198, |
|
"learning_rate": 0.00011360922159456928, |
|
"loss": 0.0864, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.46354733405875953, |
|
"grad_norm": 0.13803425431251526, |
|
"learning_rate": 0.00011326674673806195, |
|
"loss": 0.1028, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.4646354733405876, |
|
"grad_norm": 0.1662827581167221, |
|
"learning_rate": 0.0001129241134155949, |
|
"loss": 0.1053, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.4657236126224157, |
|
"grad_norm": 0.13029906153678894, |
|
"learning_rate": 0.00011258132571978555, |
|
"loss": 0.0797, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.46681175190424373, |
|
"grad_norm": 0.18869560956954956, |
|
"learning_rate": 0.00011223838774509514, |
|
"loss": 0.1292, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.46789989118607184, |
|
"grad_norm": 0.14279034733772278, |
|
"learning_rate": 0.00011189530358778005, |
|
"loss": 0.0951, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4689880304678999, |
|
"grad_norm": 0.11428643018007278, |
|
"learning_rate": 0.00011155207734584263, |
|
"loss": 0.0664, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.470076169749728, |
|
"grad_norm": 0.20008297264575958, |
|
"learning_rate": 0.00011120871311898254, |
|
"loss": 0.1027, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.47116430903155604, |
|
"grad_norm": 0.22173888981342316, |
|
"learning_rate": 0.00011086521500854745, |
|
"loss": 0.1356, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.4722524483133841, |
|
"grad_norm": 0.2382795661687851, |
|
"learning_rate": 0.00011052158711748434, |
|
"loss": 0.1516, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.4733405875952122, |
|
"grad_norm": 0.2854343354701996, |
|
"learning_rate": 0.00011017783355029026, |
|
"loss": 0.1116, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.47442872687704024, |
|
"grad_norm": 0.23063793778419495, |
|
"learning_rate": 0.00010983395841296348, |
|
"loss": 0.107, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.47551686615886835, |
|
"grad_norm": 0.19402769207954407, |
|
"learning_rate": 0.00010948996581295436, |
|
"loss": 0.0883, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.4766050054406964, |
|
"grad_norm": 0.2664678692817688, |
|
"learning_rate": 0.00010914585985911632, |
|
"loss": 0.1161, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.4776931447225245, |
|
"grad_norm": 0.29061347246170044, |
|
"learning_rate": 0.00010880164466165674, |
|
"loss": 0.1833, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.47878128400435255, |
|
"grad_norm": 0.33060985803604126, |
|
"learning_rate": 0.00010845732433208779, |
|
"loss": 0.1521, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.47986942328618065, |
|
"grad_norm": 0.28285855054855347, |
|
"learning_rate": 0.00010811290298317755, |
|
"loss": 0.1248, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.4809575625680087, |
|
"grad_norm": 0.49815383553504944, |
|
"learning_rate": 0.00010776838472890065, |
|
"loss": 0.2238, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.4820457018498368, |
|
"grad_norm": 0.367214173078537, |
|
"learning_rate": 0.00010742377368438914, |
|
"loss": 0.2344, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.48313384113166485, |
|
"grad_norm": 0.3444245159626007, |
|
"learning_rate": 0.00010707907396588361, |
|
"loss": 0.1254, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.4842219804134929, |
|
"grad_norm": 0.31096217036247253, |
|
"learning_rate": 0.00010673428969068364, |
|
"loss": 0.1313, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.485310119695321, |
|
"grad_norm": 0.5377318263053894, |
|
"learning_rate": 0.0001063894249770989, |
|
"loss": 0.2526, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.48639825897714906, |
|
"grad_norm": 0.4121945798397064, |
|
"learning_rate": 0.00010604448394439983, |
|
"loss": 0.1663, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.48748639825897716, |
|
"grad_norm": 0.48366662859916687, |
|
"learning_rate": 0.00010569947071276847, |
|
"loss": 0.2457, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.4885745375408052, |
|
"grad_norm": 0.6081061959266663, |
|
"learning_rate": 0.0001053543894032493, |
|
"loss": 0.2678, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.4896626768226333, |
|
"grad_norm": 1.0334888696670532, |
|
"learning_rate": 0.00010500924413769988, |
|
"loss": 0.3153, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.49075081610446136, |
|
"grad_norm": 0.0451701320707798, |
|
"learning_rate": 0.00010466403903874176, |
|
"loss": 0.0455, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.49183895538628947, |
|
"grad_norm": 0.05214075744152069, |
|
"learning_rate": 0.00010431877822971117, |
|
"loss": 0.0519, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.4929270946681175, |
|
"grad_norm": 0.06553710252046585, |
|
"learning_rate": 0.00010397346583460971, |
|
"loss": 0.0557, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.4940152339499456, |
|
"grad_norm": 0.06424305588006973, |
|
"learning_rate": 0.00010362810597805526, |
|
"loss": 0.0672, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.49510337323177367, |
|
"grad_norm": 0.08962132036685944, |
|
"learning_rate": 0.00010328270278523256, |
|
"loss": 0.0957, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4961915125136017, |
|
"grad_norm": 0.08780992031097412, |
|
"learning_rate": 0.00010293726038184393, |
|
"loss": 0.0894, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.4972796517954298, |
|
"grad_norm": 0.08018220961093903, |
|
"learning_rate": 0.00010259178289406011, |
|
"loss": 0.0663, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.49836779107725787, |
|
"grad_norm": 0.07880765199661255, |
|
"learning_rate": 0.0001022462744484709, |
|
"loss": 0.0832, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.499455930359086, |
|
"grad_norm": 0.07101229578256607, |
|
"learning_rate": 0.00010190073917203589, |
|
"loss": 0.054, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.500544069640914, |
|
"grad_norm": 0.07901884615421295, |
|
"learning_rate": 0.0001015551811920351, |
|
"loss": 0.0611, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.500544069640914, |
|
"eval_loss": 0.11412899941205978, |
|
"eval_runtime": 24.4441, |
|
"eval_samples_per_second": 15.832, |
|
"eval_steps_per_second": 7.936, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5016322089227421, |
|
"grad_norm": 0.08245149999856949, |
|
"learning_rate": 0.00010120960463601976, |
|
"loss": 0.0758, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.5027203482045702, |
|
"grad_norm": 0.07407300174236298, |
|
"learning_rate": 0.00010086401363176305, |
|
"loss": 0.0567, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5038084874863983, |
|
"grad_norm": 0.08621735125780106, |
|
"learning_rate": 0.00010051841230721065, |
|
"loss": 0.0853, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.5048966267682263, |
|
"grad_norm": 0.08985213935375214, |
|
"learning_rate": 0.00010017280479043147, |
|
"loss": 0.0806, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5059847660500544, |
|
"grad_norm": 0.10007097572088242, |
|
"learning_rate": 9.982719520956855e-05, |
|
"loss": 0.0943, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5070729053318824, |
|
"grad_norm": 0.08833252638578415, |
|
"learning_rate": 9.948158769278939e-05, |
|
"loss": 0.0736, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5081610446137106, |
|
"grad_norm": 0.08999834209680557, |
|
"learning_rate": 9.913598636823693e-05, |
|
"loss": 0.0711, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.5092491838955386, |
|
"grad_norm": 0.09894470125436783, |
|
"learning_rate": 9.879039536398024e-05, |
|
"loss": 0.0854, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.5103373231773667, |
|
"grad_norm": 0.10664485394954681, |
|
"learning_rate": 9.844481880796491e-05, |
|
"loss": 0.0817, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.5114254624591947, |
|
"grad_norm": 0.08572715520858765, |
|
"learning_rate": 9.809926082796415e-05, |
|
"loss": 0.0685, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5125136017410229, |
|
"grad_norm": 0.1364021748304367, |
|
"learning_rate": 9.775372555152912e-05, |
|
"loss": 0.1289, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.5136017410228509, |
|
"grad_norm": 0.0975506454706192, |
|
"learning_rate": 9.740821710593989e-05, |
|
"loss": 0.0816, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.514689880304679, |
|
"grad_norm": 0.10148054361343384, |
|
"learning_rate": 9.70627396181561e-05, |
|
"loss": 0.0685, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.515778019586507, |
|
"grad_norm": 0.10723249614238739, |
|
"learning_rate": 9.671729721476746e-05, |
|
"loss": 0.1108, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.5168661588683352, |
|
"grad_norm": 0.1059638112783432, |
|
"learning_rate": 9.637189402194476e-05, |
|
"loss": 0.0797, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5179542981501633, |
|
"grad_norm": 0.09612549841403961, |
|
"learning_rate": 9.602653416539031e-05, |
|
"loss": 0.0756, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.5190424374319913, |
|
"grad_norm": 0.12177007645368576, |
|
"learning_rate": 9.568122177028884e-05, |
|
"loss": 0.072, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.5201305767138193, |
|
"grad_norm": 0.10441110283136368, |
|
"learning_rate": 9.533596096125825e-05, |
|
"loss": 0.082, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.5212187159956474, |
|
"grad_norm": 0.1342850774526596, |
|
"learning_rate": 9.499075586230013e-05, |
|
"loss": 0.0877, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.5223068552774756, |
|
"grad_norm": 0.14177678525447845, |
|
"learning_rate": 9.464561059675073e-05, |
|
"loss": 0.1105, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5233949945593036, |
|
"grad_norm": 0.14493241906166077, |
|
"learning_rate": 9.430052928723153e-05, |
|
"loss": 0.0975, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.5244831338411317, |
|
"grad_norm": 0.22142328321933746, |
|
"learning_rate": 9.395551605560018e-05, |
|
"loss": 0.1441, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.5255712731229597, |
|
"grad_norm": 0.13570967316627502, |
|
"learning_rate": 9.361057502290113e-05, |
|
"loss": 0.0952, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.5266594124047879, |
|
"grad_norm": 0.16124001145362854, |
|
"learning_rate": 9.326571030931637e-05, |
|
"loss": 0.0976, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.5277475516866159, |
|
"grad_norm": 0.17145387828350067, |
|
"learning_rate": 9.292092603411641e-05, |
|
"loss": 0.1024, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.528835690968444, |
|
"grad_norm": 0.23006115853786469, |
|
"learning_rate": 9.257622631561085e-05, |
|
"loss": 0.147, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.529923830250272, |
|
"grad_norm": 0.2926236391067505, |
|
"learning_rate": 9.223161527109937e-05, |
|
"loss": 0.1706, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.5310119695321001, |
|
"grad_norm": 0.2122851312160492, |
|
"learning_rate": 9.188709701682247e-05, |
|
"loss": 0.0793, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.5321001088139282, |
|
"grad_norm": 0.30783331394195557, |
|
"learning_rate": 9.154267566791223e-05, |
|
"loss": 0.1446, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.5331882480957563, |
|
"grad_norm": 0.26017701625823975, |
|
"learning_rate": 9.119835533834331e-05, |
|
"loss": 0.0925, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5342763873775843, |
|
"grad_norm": 0.3646100163459778, |
|
"learning_rate": 9.085414014088369e-05, |
|
"loss": 0.1893, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.5353645266594124, |
|
"grad_norm": 0.3202396035194397, |
|
"learning_rate": 9.051003418704565e-05, |
|
"loss": 0.1389, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.5364526659412405, |
|
"grad_norm": 0.420622318983078, |
|
"learning_rate": 9.016604158703654e-05, |
|
"loss": 0.2336, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.5375408052230686, |
|
"grad_norm": 0.41953420639038086, |
|
"learning_rate": 8.982216644970979e-05, |
|
"loss": 0.2328, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.5386289445048966, |
|
"grad_norm": 0.39154770970344543, |
|
"learning_rate": 8.947841288251568e-05, |
|
"loss": 0.1791, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5397170837867247, |
|
"grad_norm": 0.3094623386859894, |
|
"learning_rate": 8.913478499145254e-05, |
|
"loss": 0.1163, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.5408052230685527, |
|
"grad_norm": 0.47343355417251587, |
|
"learning_rate": 8.879128688101749e-05, |
|
"loss": 0.2118, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.5418933623503809, |
|
"grad_norm": 0.5772989988327026, |
|
"learning_rate": 8.844792265415738e-05, |
|
"loss": 0.3241, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.5429815016322089, |
|
"grad_norm": 0.574259877204895, |
|
"learning_rate": 8.810469641222001e-05, |
|
"loss": 0.1832, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.544069640914037, |
|
"grad_norm": 0.7393798828125, |
|
"learning_rate": 8.776161225490489e-05, |
|
"loss": 0.3208, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.545157780195865, |
|
"grad_norm": 0.08375679701566696, |
|
"learning_rate": 8.741867428021446e-05, |
|
"loss": 0.0647, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.5462459194776932, |
|
"grad_norm": 0.06409952789545059, |
|
"learning_rate": 8.707588658440511e-05, |
|
"loss": 0.0669, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.5473340587595212, |
|
"grad_norm": 0.07277576625347137, |
|
"learning_rate": 8.673325326193806e-05, |
|
"loss": 0.069, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.5484221980413493, |
|
"grad_norm": 0.0738697499036789, |
|
"learning_rate": 8.639077840543077e-05, |
|
"loss": 0.0729, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.5495103373231773, |
|
"grad_norm": 0.07032415270805359, |
|
"learning_rate": 8.604846610560771e-05, |
|
"loss": 0.0619, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5505984766050055, |
|
"grad_norm": 0.08152731508016586, |
|
"learning_rate": 8.570632045125185e-05, |
|
"loss": 0.0984, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.5516866158868335, |
|
"grad_norm": 0.08058314025402069, |
|
"learning_rate": 8.536434552915556e-05, |
|
"loss": 0.0727, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.5527747551686616, |
|
"grad_norm": 0.08897782117128372, |
|
"learning_rate": 8.502254542407186e-05, |
|
"loss": 0.0844, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.5538628944504896, |
|
"grad_norm": 0.07130607962608337, |
|
"learning_rate": 8.468092421866573e-05, |
|
"loss": 0.0612, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.5549510337323177, |
|
"grad_norm": 0.09470459073781967, |
|
"learning_rate": 8.433948599346516e-05, |
|
"loss": 0.0983, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5560391730141458, |
|
"grad_norm": 0.10217074304819107, |
|
"learning_rate": 8.399823482681262e-05, |
|
"loss": 0.1066, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.5571273122959739, |
|
"grad_norm": 0.08955902606248856, |
|
"learning_rate": 8.36571747948162e-05, |
|
"loss": 0.0754, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.558215451577802, |
|
"grad_norm": 0.08967861533164978, |
|
"learning_rate": 8.33163099713009e-05, |
|
"loss": 0.0745, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.55930359085963, |
|
"grad_norm": 0.09490852057933807, |
|
"learning_rate": 8.297564442776014e-05, |
|
"loss": 0.0956, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.5603917301414582, |
|
"grad_norm": 0.1147465854883194, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 0.0918, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5614798694232862, |
|
"grad_norm": 0.10192292928695679, |
|
"learning_rate": 8.22949274546255e-05, |
|
"loss": 0.0865, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.5625680087051143, |
|
"grad_norm": 0.09790289402008057, |
|
"learning_rate": 8.195488415592238e-05, |
|
"loss": 0.0884, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.5636561479869423, |
|
"grad_norm": 0.10624364018440247, |
|
"learning_rate": 8.161505639887817e-05, |
|
"loss": 0.0743, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.5647442872687704, |
|
"grad_norm": 0.10007826238870621, |
|
"learning_rate": 8.127544824259889e-05, |
|
"loss": 0.077, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.5658324265505985, |
|
"grad_norm": 0.10816752165555954, |
|
"learning_rate": 8.093606374356759e-05, |
|
"loss": 0.0967, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5669205658324266, |
|
"grad_norm": 0.09151450544595718, |
|
"learning_rate": 8.059690695559568e-05, |
|
"loss": 0.0559, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.5680087051142546, |
|
"grad_norm": 0.09337753057479858, |
|
"learning_rate": 8.025798192977481e-05, |
|
"loss": 0.0666, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.5690968443960827, |
|
"grad_norm": 0.07552886009216309, |
|
"learning_rate": 7.991929271442817e-05, |
|
"loss": 0.0427, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.5701849836779108, |
|
"grad_norm": 0.125149667263031, |
|
"learning_rate": 7.958084335506239e-05, |
|
"loss": 0.0886, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.5712731229597389, |
|
"grad_norm": 0.1259811669588089, |
|
"learning_rate": 7.924263789431912e-05, |
|
"loss": 0.0836, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5723612622415669, |
|
"grad_norm": 0.11561718583106995, |
|
"learning_rate": 7.89046803719267e-05, |
|
"loss": 0.0767, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.573449401523395, |
|
"grad_norm": 0.15881969034671783, |
|
"learning_rate": 7.856697482465196e-05, |
|
"loss": 0.132, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.5745375408052231, |
|
"grad_norm": 0.11481433361768723, |
|
"learning_rate": 7.822952528625191e-05, |
|
"loss": 0.0772, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.5756256800870512, |
|
"grad_norm": 0.133880615234375, |
|
"learning_rate": 7.789233578742582e-05, |
|
"loss": 0.0851, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.5767138193688792, |
|
"grad_norm": 0.1513182371854782, |
|
"learning_rate": 7.755541035576677e-05, |
|
"loss": 0.1258, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5778019586507073, |
|
"grad_norm": 0.1432909518480301, |
|
"learning_rate": 7.721875301571359e-05, |
|
"loss": 0.1227, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.5788900979325353, |
|
"grad_norm": 0.11642878502607346, |
|
"learning_rate": 7.688236778850306e-05, |
|
"loss": 0.0686, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.5799782372143635, |
|
"grad_norm": 0.16764387488365173, |
|
"learning_rate": 7.654625869212146e-05, |
|
"loss": 0.1262, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.5810663764961915, |
|
"grad_norm": 0.11564290523529053, |
|
"learning_rate": 7.6210429741257e-05, |
|
"loss": 0.0572, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.5821545157780196, |
|
"grad_norm": 0.2554946541786194, |
|
"learning_rate": 7.587488494725157e-05, |
|
"loss": 0.1812, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5832426550598476, |
|
"grad_norm": 0.1616709679365158, |
|
"learning_rate": 7.55396283180529e-05, |
|
"loss": 0.0818, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.5843307943416758, |
|
"grad_norm": 0.321591317653656, |
|
"learning_rate": 7.520466385816671e-05, |
|
"loss": 0.2124, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.5854189336235038, |
|
"grad_norm": 0.21219317615032196, |
|
"learning_rate": 7.48699955686089e-05, |
|
"loss": 0.1097, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.5865070729053319, |
|
"grad_norm": 0.19242917001247406, |
|
"learning_rate": 7.453562744685778e-05, |
|
"loss": 0.0769, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.5875952121871599, |
|
"grad_norm": 0.27252575755119324, |
|
"learning_rate": 7.42015634868062e-05, |
|
"loss": 0.123, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.588683351468988, |
|
"grad_norm": 0.3768553137779236, |
|
"learning_rate": 7.386780767871397e-05, |
|
"loss": 0.1882, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.5897714907508161, |
|
"grad_norm": 0.2910580337047577, |
|
"learning_rate": 7.353436400916004e-05, |
|
"loss": 0.11, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.5908596300326442, |
|
"grad_norm": 0.5426351428031921, |
|
"learning_rate": 7.320123646099519e-05, |
|
"loss": 0.2963, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.5919477693144722, |
|
"grad_norm": 0.47717493772506714, |
|
"learning_rate": 7.286842901329412e-05, |
|
"loss": 0.1808, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.5930359085963003, |
|
"grad_norm": 0.3526507019996643, |
|
"learning_rate": 7.253594564130804e-05, |
|
"loss": 0.1664, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5941240478781284, |
|
"grad_norm": 0.4166988134384155, |
|
"learning_rate": 7.22037903164173e-05, |
|
"loss": 0.199, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.5952121871599565, |
|
"grad_norm": 0.3085844814777374, |
|
"learning_rate": 7.187196700608373e-05, |
|
"loss": 0.1008, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.5963003264417845, |
|
"grad_norm": 0.6430099606513977, |
|
"learning_rate": 7.154047967380354e-05, |
|
"loss": 0.2083, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.5973884657236126, |
|
"grad_norm": 0.5886674523353577, |
|
"learning_rate": 7.12093322790597e-05, |
|
"loss": 0.2135, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.5984766050054406, |
|
"grad_norm": 0.6182297468185425, |
|
"learning_rate": 7.087852877727481e-05, |
|
"loss": 0.1715, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5995647442872688, |
|
"grad_norm": 0.052968356758356094, |
|
"learning_rate": 7.054807311976379e-05, |
|
"loss": 0.0482, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.6006528835690969, |
|
"grad_norm": 0.05972611904144287, |
|
"learning_rate": 7.021796925368667e-05, |
|
"loss": 0.0518, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.6017410228509249, |
|
"grad_norm": 0.06913693249225616, |
|
"learning_rate": 6.988822112200156e-05, |
|
"loss": 0.067, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.602829162132753, |
|
"grad_norm": 0.07957234978675842, |
|
"learning_rate": 6.955883266341741e-05, |
|
"loss": 0.0881, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.6039173014145811, |
|
"grad_norm": 0.07048328220844269, |
|
"learning_rate": 6.922980781234699e-05, |
|
"loss": 0.073, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6050054406964092, |
|
"grad_norm": 0.07528451830148697, |
|
"learning_rate": 6.890115049885994e-05, |
|
"loss": 0.0777, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.6060935799782372, |
|
"grad_norm": 0.07747096568346024, |
|
"learning_rate": 6.85728646486359e-05, |
|
"loss": 0.0758, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.6071817192600653, |
|
"grad_norm": 0.10643380880355835, |
|
"learning_rate": 6.82449541829174e-05, |
|
"loss": 0.1274, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.6082698585418934, |
|
"grad_norm": 0.08175136148929596, |
|
"learning_rate": 6.791742301846326e-05, |
|
"loss": 0.0838, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.6093579978237215, |
|
"grad_norm": 0.09339357912540436, |
|
"learning_rate": 6.759027506750158e-05, |
|
"loss": 0.0767, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6104461371055495, |
|
"grad_norm": 0.07980689406394958, |
|
"learning_rate": 6.726351423768322e-05, |
|
"loss": 0.0645, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.6115342763873776, |
|
"grad_norm": 0.09120305627584457, |
|
"learning_rate": 6.693714443203507e-05, |
|
"loss": 0.0763, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.6126224156692056, |
|
"grad_norm": 0.09666004031896591, |
|
"learning_rate": 6.661116954891328e-05, |
|
"loss": 0.0749, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.6137105549510338, |
|
"grad_norm": 0.09731943160295486, |
|
"learning_rate": 6.62855934819569e-05, |
|
"loss": 0.0891, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.6147986942328618, |
|
"grad_norm": 0.09505044668912888, |
|
"learning_rate": 6.59604201200412e-05, |
|
"loss": 0.0734, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6158868335146899, |
|
"grad_norm": 0.09885645657777786, |
|
"learning_rate": 6.563565334723134e-05, |
|
"loss": 0.0726, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.6169749727965179, |
|
"grad_norm": 0.10182873159646988, |
|
"learning_rate": 6.531129704273604e-05, |
|
"loss": 0.0907, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.6180631120783461, |
|
"grad_norm": 0.10983593761920929, |
|
"learning_rate": 6.498735508086093e-05, |
|
"loss": 0.0774, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.6191512513601741, |
|
"grad_norm": 0.13069598376750946, |
|
"learning_rate": 6.466383133096267e-05, |
|
"loss": 0.1334, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.6202393906420022, |
|
"grad_norm": 0.10764119029045105, |
|
"learning_rate": 6.434072965740242e-05, |
|
"loss": 0.0921, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6213275299238302, |
|
"grad_norm": 0.10900213569402695, |
|
"learning_rate": 6.40180539194999e-05, |
|
"loss": 0.0842, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.6224156692056583, |
|
"grad_norm": 0.13261805474758148, |
|
"learning_rate": 6.369580797148718e-05, |
|
"loss": 0.1045, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.6235038084874864, |
|
"grad_norm": 0.13680216670036316, |
|
"learning_rate": 6.337399566246257e-05, |
|
"loss": 0.1367, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.6245919477693145, |
|
"grad_norm": 0.13827918469905853, |
|
"learning_rate": 6.305262083634488e-05, |
|
"loss": 0.1063, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.6256800870511425, |
|
"grad_norm": 0.0938330814242363, |
|
"learning_rate": 6.273168733182722e-05, |
|
"loss": 0.0686, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6267682263329706, |
|
"grad_norm": 0.14945276081562042, |
|
"learning_rate": 6.241119898233144e-05, |
|
"loss": 0.1373, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.6278563656147987, |
|
"grad_norm": 0.10897387564182281, |
|
"learning_rate": 6.209115961596208e-05, |
|
"loss": 0.068, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.6289445048966268, |
|
"grad_norm": 0.11899581551551819, |
|
"learning_rate": 6.177157305546078e-05, |
|
"loss": 0.0812, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.6300326441784548, |
|
"grad_norm": 0.12964366376399994, |
|
"learning_rate": 6.145244311816063e-05, |
|
"loss": 0.0919, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.6311207834602829, |
|
"grad_norm": 0.14867788553237915, |
|
"learning_rate": 6.113377361594049e-05, |
|
"loss": 0.1079, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6322089227421109, |
|
"grad_norm": 0.13811950385570526, |
|
"learning_rate": 6.0815568355179556e-05, |
|
"loss": 0.0871, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.6332970620239391, |
|
"grad_norm": 0.14532406628131866, |
|
"learning_rate": 6.0497831136711836e-05, |
|
"loss": 0.0912, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.6343852013057671, |
|
"grad_norm": 0.19129644334316254, |
|
"learning_rate": 6.018056575578075e-05, |
|
"loss": 0.1137, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.6354733405875952, |
|
"grad_norm": 0.16450344026088715, |
|
"learning_rate": 5.986377600199371e-05, |
|
"loss": 0.072, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.6365614798694232, |
|
"grad_norm": 0.1428171992301941, |
|
"learning_rate": 5.9547465659277215e-05, |
|
"loss": 0.0671, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6376496191512514, |
|
"grad_norm": 0.18160395324230194, |
|
"learning_rate": 5.923163850583113e-05, |
|
"loss": 0.1049, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.6387377584330794, |
|
"grad_norm": 0.18718522787094116, |
|
"learning_rate": 5.8916298314083915e-05, |
|
"loss": 0.1037, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.6398258977149075, |
|
"grad_norm": 0.3233761489391327, |
|
"learning_rate": 5.860144885064751e-05, |
|
"loss": 0.1401, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.6409140369967355, |
|
"grad_norm": 0.27942174673080444, |
|
"learning_rate": 5.828709387627218e-05, |
|
"loss": 0.1464, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.6420021762785637, |
|
"grad_norm": 0.26235440373420715, |
|
"learning_rate": 5.797323714580192e-05, |
|
"loss": 0.1238, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6430903155603918, |
|
"grad_norm": 0.3004370927810669, |
|
"learning_rate": 5.765988240812921e-05, |
|
"loss": 0.1483, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.6441784548422198, |
|
"grad_norm": 0.33811435103416443, |
|
"learning_rate": 5.73470334061505e-05, |
|
"loss": 0.1895, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.6452665941240479, |
|
"grad_norm": 0.352417528629303, |
|
"learning_rate": 5.7034693876721376e-05, |
|
"loss": 0.1148, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.6463547334058759, |
|
"grad_norm": 0.296340674161911, |
|
"learning_rate": 5.6722867550612116e-05, |
|
"loss": 0.1647, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.6474428726877041, |
|
"grad_norm": 0.47691333293914795, |
|
"learning_rate": 5.6411558152462894e-05, |
|
"loss": 0.2066, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6485310119695321, |
|
"grad_norm": 0.5979732275009155, |
|
"learning_rate": 5.6100769400739383e-05, |
|
"loss": 0.291, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.6496191512513602, |
|
"grad_norm": 0.4951854944229126, |
|
"learning_rate": 5.579050500768836e-05, |
|
"loss": 0.2417, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.6507072905331882, |
|
"grad_norm": 0.5365352630615234, |
|
"learning_rate": 5.54807686792933e-05, |
|
"loss": 0.2438, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.6517954298150164, |
|
"grad_norm": 0.6378306150436401, |
|
"learning_rate": 5.5171564115230254e-05, |
|
"loss": 0.2436, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.6528835690968444, |
|
"grad_norm": 0.5462427735328674, |
|
"learning_rate": 5.486289500882355e-05, |
|
"loss": 0.1536, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6539717083786725, |
|
"grad_norm": 0.0600992813706398, |
|
"learning_rate": 5.4554765047001613e-05, |
|
"loss": 0.0662, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.6550598476605005, |
|
"grad_norm": 0.06883436441421509, |
|
"learning_rate": 5.424717791025302e-05, |
|
"loss": 0.0791, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.6561479869423286, |
|
"grad_norm": 0.07644320279359818, |
|
"learning_rate": 5.394013727258254e-05, |
|
"loss": 0.0863, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.6572361262241567, |
|
"grad_norm": 0.07377026975154877, |
|
"learning_rate": 5.363364680146725e-05, |
|
"loss": 0.0669, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.6583242655059848, |
|
"grad_norm": 0.06373579055070877, |
|
"learning_rate": 5.332771015781275e-05, |
|
"loss": 0.0537, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6594124047878128, |
|
"grad_norm": 0.05605285242199898, |
|
"learning_rate": 5.302233099590928e-05, |
|
"loss": 0.0505, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.6605005440696409, |
|
"grad_norm": 0.0761309266090393, |
|
"learning_rate": 5.271751296338823e-05, |
|
"loss": 0.079, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.661588683351469, |
|
"grad_norm": 0.08843579143285751, |
|
"learning_rate": 5.2413259701178505e-05, |
|
"loss": 0.0662, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.6626768226332971, |
|
"grad_norm": 0.07351501286029816, |
|
"learning_rate": 5.210957484346314e-05, |
|
"loss": 0.0533, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.6637649619151251, |
|
"grad_norm": 0.08167769014835358, |
|
"learning_rate": 5.180646201763577e-05, |
|
"loss": 0.0796, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6648531011969532, |
|
"grad_norm": 0.09149360656738281, |
|
"learning_rate": 5.150392484425728e-05, |
|
"loss": 0.1115, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.6659412404787813, |
|
"grad_norm": 0.08288878947496414, |
|
"learning_rate": 5.120196693701267e-05, |
|
"loss": 0.0761, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.6670293797606094, |
|
"grad_norm": 0.07389149814844131, |
|
"learning_rate": 5.090059190266779e-05, |
|
"loss": 0.0645, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.6681175190424374, |
|
"grad_norm": 0.08279106020927429, |
|
"learning_rate": 5.059980334102637e-05, |
|
"loss": 0.0772, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.6692056583242655, |
|
"grad_norm": 0.09307517111301422, |
|
"learning_rate": 5.0299604844886985e-05, |
|
"loss": 0.0823, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6702937976060935, |
|
"grad_norm": 0.09981580078601837, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.0991, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.6713819368879217, |
|
"grad_norm": 0.11640693992376328, |
|
"learning_rate": 4.9700992385024934e-05, |
|
"loss": 0.1061, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.6724700761697497, |
|
"grad_norm": 0.10669746994972229, |
|
"learning_rate": 4.940258557148765e-05, |
|
"loss": 0.0897, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.6735582154515778, |
|
"grad_norm": 0.1135135293006897, |
|
"learning_rate": 4.9104783123737566e-05, |
|
"loss": 0.1121, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.6746463547334058, |
|
"grad_norm": 0.08616163581609726, |
|
"learning_rate": 4.880758859890536e-05, |
|
"loss": 0.0684, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.675734494015234, |
|
"grad_norm": 0.10301216691732407, |
|
"learning_rate": 4.851100554686021e-05, |
|
"loss": 0.0894, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.676822633297062, |
|
"grad_norm": 0.10738655179738998, |
|
"learning_rate": 4.821503751016746e-05, |
|
"loss": 0.0919, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.6779107725788901, |
|
"grad_norm": 0.10355032980442047, |
|
"learning_rate": 4.791968802404648e-05, |
|
"loss": 0.0855, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.6789989118607181, |
|
"grad_norm": 0.10595700144767761, |
|
"learning_rate": 4.762496061632814e-05, |
|
"loss": 0.0903, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.6800870511425462, |
|
"grad_norm": 0.12492071092128754, |
|
"learning_rate": 4.733085880741301e-05, |
|
"loss": 0.1232, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6811751904243744, |
|
"grad_norm": 0.1137222945690155, |
|
"learning_rate": 4.7037386110228985e-05, |
|
"loss": 0.0958, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.6822633297062024, |
|
"grad_norm": 0.13245636224746704, |
|
"learning_rate": 4.6744546030189486e-05, |
|
"loss": 0.0981, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.6833514689880305, |
|
"grad_norm": 0.12368131428956985, |
|
"learning_rate": 4.645234206515171e-05, |
|
"loss": 0.0811, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.6844396082698585, |
|
"grad_norm": 0.10623182356357574, |
|
"learning_rate": 4.6160777705374524e-05, |
|
"loss": 0.0723, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.6855277475516867, |
|
"grad_norm": 0.13129866123199463, |
|
"learning_rate": 4.586985643347717e-05, |
|
"loss": 0.1147, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6866158868335147, |
|
"grad_norm": 0.11502306908369064, |
|
"learning_rate": 4.5579581724397255e-05, |
|
"loss": 0.0715, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.6877040261153428, |
|
"grad_norm": 0.1408235728740692, |
|
"learning_rate": 4.5289957045349653e-05, |
|
"loss": 0.0919, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.6887921653971708, |
|
"grad_norm": 0.1587967872619629, |
|
"learning_rate": 4.5000985855784746e-05, |
|
"loss": 0.109, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.6898803046789989, |
|
"grad_norm": 0.19859641790390015, |
|
"learning_rate": 4.471267160734731e-05, |
|
"loss": 0.0959, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.690968443960827, |
|
"grad_norm": 0.16058838367462158, |
|
"learning_rate": 4.442501774383515e-05, |
|
"loss": 0.1012, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6920565832426551, |
|
"grad_norm": 0.2084600329399109, |
|
"learning_rate": 4.413802770115816e-05, |
|
"loss": 0.1343, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.6931447225244831, |
|
"grad_norm": 0.23033252358436584, |
|
"learning_rate": 4.385170490729712e-05, |
|
"loss": 0.1517, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.6942328618063112, |
|
"grad_norm": 0.2856910228729248, |
|
"learning_rate": 4.3566052782262735e-05, |
|
"loss": 0.1669, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.6953210010881393, |
|
"grad_norm": 0.19062520563602448, |
|
"learning_rate": 4.328107473805487e-05, |
|
"loss": 0.0654, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.6964091403699674, |
|
"grad_norm": 0.3172631561756134, |
|
"learning_rate": 4.2996774178621736e-05, |
|
"loss": 0.1788, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6974972796517954, |
|
"grad_norm": 0.4395265281200409, |
|
"learning_rate": 4.271315449981934e-05, |
|
"loss": 0.1254, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.6985854189336235, |
|
"grad_norm": 0.3653918206691742, |
|
"learning_rate": 4.2430219089370823e-05, |
|
"loss": 0.2148, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.6996735582154516, |
|
"grad_norm": 0.45207682251930237, |
|
"learning_rate": 4.2147971326825966e-05, |
|
"loss": 0.2424, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.7007616974972797, |
|
"grad_norm": 0.33358728885650635, |
|
"learning_rate": 4.1866414583520877e-05, |
|
"loss": 0.1578, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.7018498367791077, |
|
"grad_norm": 0.2964087426662445, |
|
"learning_rate": 4.158555222253771e-05, |
|
"loss": 0.0704, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.7029379760609358, |
|
"grad_norm": 0.3286013603210449, |
|
"learning_rate": 4.130538759866457e-05, |
|
"loss": 0.2085, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.7040261153427638, |
|
"grad_norm": 0.3953593671321869, |
|
"learning_rate": 4.102592405835536e-05, |
|
"loss": 0.2185, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.705114254624592, |
|
"grad_norm": 0.46906566619873047, |
|
"learning_rate": 4.074716493968975e-05, |
|
"loss": 0.2676, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.70620239390642, |
|
"grad_norm": 0.545005738735199, |
|
"learning_rate": 4.046911357233343e-05, |
|
"loss": 0.2487, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.7072905331882481, |
|
"grad_norm": 0.5767453908920288, |
|
"learning_rate": 4.019177327749822e-05, |
|
"loss": 0.2432, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7083786724700761, |
|
"grad_norm": 0.058534830808639526, |
|
"learning_rate": 3.991514736790258e-05, |
|
"loss": 0.0652, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.7094668117519043, |
|
"grad_norm": 0.06634719669818878, |
|
"learning_rate": 3.963923914773187e-05, |
|
"loss": 0.0712, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.7105549510337323, |
|
"grad_norm": 0.0771845281124115, |
|
"learning_rate": 3.936405191259891e-05, |
|
"loss": 0.0702, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.7116430903155604, |
|
"grad_norm": 0.06693675369024277, |
|
"learning_rate": 3.9089588949504655e-05, |
|
"loss": 0.0785, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.7127312295973884, |
|
"grad_norm": 0.07800418138504028, |
|
"learning_rate": 3.8815853536798904e-05, |
|
"loss": 0.093, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.7138193688792165, |
|
"grad_norm": 0.06311319023370743, |
|
"learning_rate": 3.854284894414122e-05, |
|
"loss": 0.0619, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.7149075081610446, |
|
"grad_norm": 0.08463241159915924, |
|
"learning_rate": 3.82705784324618e-05, |
|
"loss": 0.0799, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.7159956474428727, |
|
"grad_norm": 0.06584708392620087, |
|
"learning_rate": 3.79990452539225e-05, |
|
"loss": 0.0508, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.7170837867247007, |
|
"grad_norm": 0.07503578066825867, |
|
"learning_rate": 3.772825265187802e-05, |
|
"loss": 0.0651, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.7181719260065288, |
|
"grad_norm": 0.09440509229898453, |
|
"learning_rate": 3.7458203860837234e-05, |
|
"loss": 0.0824, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.719260065288357, |
|
"grad_norm": 0.0794801339507103, |
|
"learning_rate": 3.7188902106424416e-05, |
|
"loss": 0.0752, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.720348204570185, |
|
"grad_norm": 0.08709513396024704, |
|
"learning_rate": 3.692035060534088e-05, |
|
"loss": 0.0858, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.721436343852013, |
|
"grad_norm": 0.08292333036661148, |
|
"learning_rate": 3.665255256532638e-05, |
|
"loss": 0.0747, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.7225244831338411, |
|
"grad_norm": 0.08945606648921967, |
|
"learning_rate": 3.638551118512089e-05, |
|
"loss": 0.0875, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.7236126224156693, |
|
"grad_norm": 0.09742715954780579, |
|
"learning_rate": 3.611922965442648e-05, |
|
"loss": 0.0894, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.7247007616974973, |
|
"grad_norm": 0.09512501209974289, |
|
"learning_rate": 3.5853711153868965e-05, |
|
"loss": 0.0831, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.7257889009793254, |
|
"grad_norm": 0.12309622764587402, |
|
"learning_rate": 3.558895885496023e-05, |
|
"loss": 0.1218, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.7268770402611534, |
|
"grad_norm": 0.09456098824739456, |
|
"learning_rate": 3.53249759200601e-05, |
|
"loss": 0.0957, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.7279651795429815, |
|
"grad_norm": 0.10691989958286285, |
|
"learning_rate": 3.506176550233863e-05, |
|
"loss": 0.1067, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.7290533188248096, |
|
"grad_norm": 0.11746580898761749, |
|
"learning_rate": 3.479933074573858e-05, |
|
"loss": 0.1002, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7301414581066377, |
|
"grad_norm": 0.0943666398525238, |
|
"learning_rate": 3.4537674784937614e-05, |
|
"loss": 0.0842, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.7312295973884657, |
|
"grad_norm": 0.0884823352098465, |
|
"learning_rate": 3.427680074531113e-05, |
|
"loss": 0.0642, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.7323177366702938, |
|
"grad_norm": 0.10647040605545044, |
|
"learning_rate": 3.401671174289469e-05, |
|
"loss": 0.0805, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.7334058759521219, |
|
"grad_norm": 0.11635477840900421, |
|
"learning_rate": 3.3757410884346894e-05, |
|
"loss": 0.1075, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.73449401523395, |
|
"grad_norm": 0.10123847424983978, |
|
"learning_rate": 3.3498901266912396e-05, |
|
"loss": 0.0783, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.735582154515778, |
|
"grad_norm": 0.1347595453262329, |
|
"learning_rate": 3.324118597838464e-05, |
|
"loss": 0.1216, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.7366702937976061, |
|
"grad_norm": 0.11851736158132553, |
|
"learning_rate": 3.298426809706928e-05, |
|
"loss": 0.0891, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.7377584330794341, |
|
"grad_norm": 0.1312292367219925, |
|
"learning_rate": 3.2728150691747115e-05, |
|
"loss": 0.1008, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.7388465723612623, |
|
"grad_norm": 0.12228815257549286, |
|
"learning_rate": 3.2472836821637744e-05, |
|
"loss": 0.0696, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.7399347116430903, |
|
"grad_norm": 0.12540924549102783, |
|
"learning_rate": 3.2218329536362704e-05, |
|
"loss": 0.0801, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7410228509249184, |
|
"grad_norm": 0.12973323464393616, |
|
"learning_rate": 3.196463187590929e-05, |
|
"loss": 0.0712, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.7421109902067464, |
|
"grad_norm": 0.15368451178073883, |
|
"learning_rate": 3.1711746870594086e-05, |
|
"loss": 0.1033, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.7431991294885746, |
|
"grad_norm": 0.21176296472549438, |
|
"learning_rate": 3.145967754102691e-05, |
|
"loss": 0.1367, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.7442872687704026, |
|
"grad_norm": 0.18676644563674927, |
|
"learning_rate": 3.120842689807468e-05, |
|
"loss": 0.1081, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.7453754080522307, |
|
"grad_norm": 0.1671050786972046, |
|
"learning_rate": 3.0957997942825336e-05, |
|
"loss": 0.0898, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.7464635473340587, |
|
"grad_norm": 0.187219500541687, |
|
"learning_rate": 3.070839366655215e-05, |
|
"loss": 0.1019, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.7475516866158868, |
|
"grad_norm": 0.17203396558761597, |
|
"learning_rate": 3.0459617050677868e-05, |
|
"loss": 0.0865, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.7486398258977149, |
|
"grad_norm": 0.23022624850273132, |
|
"learning_rate": 3.021167106673928e-05, |
|
"loss": 0.1252, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.749727965179543, |
|
"grad_norm": 0.2407008409500122, |
|
"learning_rate": 2.996455867635155e-05, |
|
"loss": 0.1142, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.750816104461371, |
|
"grad_norm": 0.32173025608062744, |
|
"learning_rate": 2.9718282831172883e-05, |
|
"loss": 0.1342, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.750816104461371, |
|
"eval_loss": 0.10566242039203644, |
|
"eval_runtime": 24.3899, |
|
"eval_samples_per_second": 15.867, |
|
"eval_steps_per_second": 7.954, |
|
"step": 690 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 919, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 230, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.744651772592128e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |