{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0006826715212198,
  "eval_steps": 275,
  "global_step": 1099,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0009102286949596086,
      "grad_norm": 0.419871062040329,
      "learning_rate": 2e-05,
      "loss": 2.8811,
      "step": 1
    },
    {
      "epoch": 0.0009102286949596086,
      "eval_loss": 2.6547484397888184,
      "eval_runtime": 203.8787,
      "eval_samples_per_second": 9.079,
      "eval_steps_per_second": 4.542,
      "step": 1
    },
    {
      "epoch": 0.0018204573899192173,
      "grad_norm": 0.47298941016197205,
      "learning_rate": 4e-05,
      "loss": 2.8541,
      "step": 2
    },
    {
      "epoch": 0.0027306860848788257,
      "grad_norm": 0.48272326588630676,
      "learning_rate": 6e-05,
      "loss": 2.6363,
      "step": 3
    },
    {
      "epoch": 0.0036409147798384346,
      "grad_norm": 0.4156210422515869,
      "learning_rate": 8e-05,
      "loss": 2.666,
      "step": 4
    },
    {
      "epoch": 0.004551143474798043,
      "grad_norm": 0.375728964805603,
      "learning_rate": 0.0001,
      "loss": 2.7432,
      "step": 5
    },
    {
      "epoch": 0.0054613721697576514,
      "grad_norm": 0.3113997280597687,
      "learning_rate": 0.00012,
      "loss": 2.7398,
      "step": 6
    },
    {
      "epoch": 0.00637160086471726,
      "grad_norm": 0.4520895183086395,
      "learning_rate": 0.00014,
      "loss": 2.7017,
      "step": 7
    },
    {
      "epoch": 0.007281829559676869,
      "grad_norm": 0.5530128479003906,
      "learning_rate": 0.00016,
      "loss": 2.6855,
      "step": 8
    },
    {
      "epoch": 0.008192058254636477,
      "grad_norm": 0.43422171473503113,
      "learning_rate": 0.00018,
      "loss": 2.6454,
      "step": 9
    },
    {
      "epoch": 0.009102286949596085,
      "grad_norm": 0.3173878490924835,
      "learning_rate": 0.0002,
      "loss": 2.6638,
      "step": 10
    },
    {
      "epoch": 0.010012515644555695,
      "grad_norm": 0.3479019105434418,
      "learning_rate": 0.00019999958388469571,
      "loss": 2.5689,
      "step": 11
    },
    {
      "epoch": 0.010922744339515303,
      "grad_norm": 0.36122018098831177,
      "learning_rate": 0.00019999833554224577,
      "loss": 2.4933,
      "step": 12
    },
    {
      "epoch": 0.011832973034474913,
      "grad_norm": 0.3206821382045746,
      "learning_rate": 0.00019999625498303932,
      "loss": 2.5661,
      "step": 13
    },
    {
      "epoch": 0.01274320172943452,
      "grad_norm": 0.38150081038475037,
      "learning_rate": 0.00019999334222439147,
      "loss": 2.5545,
      "step": 14
    },
    {
      "epoch": 0.013653430424394129,
      "grad_norm": 0.3939971625804901,
      "learning_rate": 0.00019998959729054295,
      "loss": 2.5737,
      "step": 15
    },
    {
      "epoch": 0.014563659119353738,
      "grad_norm": 0.3015269339084625,
      "learning_rate": 0.0001999850202126604,
      "loss": 2.524,
      "step": 16
    },
    {
      "epoch": 0.015473887814313346,
      "grad_norm": 0.3177410662174225,
      "learning_rate": 0.00019997961102883552,
      "loss": 2.4203,
      "step": 17
    },
    {
      "epoch": 0.016384116509272954,
      "grad_norm": 0.3253481388092041,
      "learning_rate": 0.00019997336978408531,
      "loss": 2.5163,
      "step": 18
    },
    {
      "epoch": 0.017294345204232564,
      "grad_norm": 0.3166387975215912,
      "learning_rate": 0.00019996629653035126,
      "loss": 2.2021,
      "step": 19
    },
    {
      "epoch": 0.01820457389919217,
      "grad_norm": 0.32843250036239624,
      "learning_rate": 0.00019995839132649917,
      "loss": 2.4009,
      "step": 20
    },
    {
      "epoch": 0.01911480259415178,
      "grad_norm": 0.36471301317214966,
      "learning_rate": 0.00019994965423831854,
      "loss": 2.4734,
      "step": 21
    },
    {
      "epoch": 0.02002503128911139,
      "grad_norm": 0.36941981315612793,
      "learning_rate": 0.0001999400853385221,
      "loss": 2.4731,
      "step": 22
    },
    {
      "epoch": 0.020935259984071,
      "grad_norm": 0.3467971682548523,
      "learning_rate": 0.0001999296847067452,
      "loss": 2.4294,
      "step": 23
    },
    {
      "epoch": 0.021845488679030606,
      "grad_norm": 0.337288498878479,
      "learning_rate": 0.00019991845242954505,
      "loss": 2.0426,
      "step": 24
    },
    {
      "epoch": 0.022755717373990215,
      "grad_norm": 0.36415189504623413,
      "learning_rate": 0.00019990638860040006,
      "loss": 2.2803,
      "step": 25
    },
    {
      "epoch": 0.023665946068949825,
      "grad_norm": 0.37340012192726135,
      "learning_rate": 0.00019989349331970923,
      "loss": 2.2841,
      "step": 26
    },
    {
      "epoch": 0.02457617476390943,
      "grad_norm": 0.3849464952945709,
      "learning_rate": 0.00019987976669479088,
      "loss": 2.206,
      "step": 27
    },
    {
      "epoch": 0.02548640345886904,
      "grad_norm": 0.41922691464424133,
      "learning_rate": 0.00019986520883988232,
      "loss": 2.3736,
      "step": 28
    },
    {
      "epoch": 0.02639663215382865,
      "grad_norm": 0.41973674297332764,
      "learning_rate": 0.0001998498198761384,
      "loss": 2.3605,
      "step": 29
    },
    {
      "epoch": 0.027306860848788257,
      "grad_norm": 0.3731151223182678,
      "learning_rate": 0.00019983359993163078,
      "loss": 2.0799,
      "step": 30
    },
    {
      "epoch": 0.028217089543747867,
      "grad_norm": 0.41079527139663696,
      "learning_rate": 0.00019981654914134686,
      "loss": 2.3176,
      "step": 31
    },
    {
      "epoch": 0.029127318238707477,
      "grad_norm": 0.43309271335601807,
      "learning_rate": 0.00019979866764718843,
      "loss": 2.3306,
      "step": 32
    },
    {
      "epoch": 0.030037546933667083,
      "grad_norm": 0.45209765434265137,
      "learning_rate": 0.0001997799555979709,
      "loss": 2.3582,
      "step": 33
    },
    {
      "epoch": 0.030947775628626693,
      "grad_norm": 0.4335600435733795,
      "learning_rate": 0.00019976041314942155,
      "loss": 2.1077,
      "step": 34
    },
    {
      "epoch": 0.0318580043235863,
      "grad_norm": 0.43455106019973755,
      "learning_rate": 0.0001997400404641787,
      "loss": 2.2497,
      "step": 35
    },
    {
      "epoch": 0.03276823301854591,
      "grad_norm": 0.4201047718524933,
      "learning_rate": 0.00019971883771179003,
      "loss": 2.1547,
      "step": 36
    },
    {
      "epoch": 0.03367846171350552,
      "grad_norm": 0.43784937262535095,
      "learning_rate": 0.00019969680506871137,
      "loss": 2.3883,
      "step": 37
    },
    {
      "epoch": 0.03458869040846513,
      "grad_norm": 0.4830430746078491,
      "learning_rate": 0.00019967394271830504,
      "loss": 2.5713,
      "step": 38
    },
    {
      "epoch": 0.03549891910342474,
      "grad_norm": 0.4729761779308319,
      "learning_rate": 0.00019965025085083858,
      "loss": 2.3565,
      "step": 39
    },
    {
      "epoch": 0.03640914779838434,
      "grad_norm": 0.43940603733062744,
      "learning_rate": 0.000199625729663483,
      "loss": 2.1644,
      "step": 40
    },
    {
      "epoch": 0.03731937649334395,
      "grad_norm": 0.5039120316505432,
      "learning_rate": 0.00019960037936031104,
      "loss": 2.3641,
      "step": 41
    },
    {
      "epoch": 0.03822960518830356,
      "grad_norm": 0.487318217754364,
      "learning_rate": 0.00019957420015229572,
      "loss": 2.4128,
      "step": 42
    },
    {
      "epoch": 0.03913983388326317,
      "grad_norm": 0.5216456651687622,
      "learning_rate": 0.00019954719225730847,
      "loss": 2.3659,
      "step": 43
    },
    {
      "epoch": 0.04005006257822278,
      "grad_norm": 0.4877496659755707,
      "learning_rate": 0.00019951935590011718,
      "loss": 2.2571,
      "step": 44
    },
    {
      "epoch": 0.04096029127318239,
      "grad_norm": 0.4974242150783539,
      "learning_rate": 0.0001994906913123846,
      "loss": 2.1847,
      "step": 45
    },
    {
      "epoch": 0.041870519968142,
      "grad_norm": 0.5708010196685791,
      "learning_rate": 0.00019946119873266613,
      "loss": 2.6954,
      "step": 46
    },
    {
      "epoch": 0.0427807486631016,
      "grad_norm": 0.5892224311828613,
      "learning_rate": 0.00019943087840640814,
      "loss": 2.5149,
      "step": 47
    },
    {
      "epoch": 0.04369097735806121,
      "grad_norm": 0.6740038394927979,
      "learning_rate": 0.0001993997305859456,
      "loss": 2.6992,
      "step": 48
    },
    {
      "epoch": 0.04460120605302082,
      "grad_norm": 0.7869267463684082,
      "learning_rate": 0.0001993677555305002,
      "loss": 2.5103,
      "step": 49
    },
    {
      "epoch": 0.04551143474798043,
      "grad_norm": 1.244491457939148,
      "learning_rate": 0.00019933495350617813,
      "loss": 2.5167,
      "step": 50
    },
    {
      "epoch": 0.04642166344294004,
      "grad_norm": 1.1311120986938477,
      "learning_rate": 0.00019930132478596796,
      "loss": 2.7147,
      "step": 51
    },
    {
      "epoch": 0.04733189213789965,
      "grad_norm": 0.5104111433029175,
      "learning_rate": 0.00019926686964973813,
      "loss": 2.7642,
      "step": 52
    },
    {
      "epoch": 0.04824212083285925,
      "grad_norm": 0.4251229465007782,
      "learning_rate": 0.00019923158838423482,
      "loss": 2.4562,
      "step": 53
    },
    {
      "epoch": 0.04915234952781886,
      "grad_norm": 0.45901840925216675,
      "learning_rate": 0.00019919548128307954,
      "loss": 2.6106,
      "step": 54
    },
    {
      "epoch": 0.05006257822277847,
      "grad_norm": 0.4177067279815674,
      "learning_rate": 0.00019915854864676664,
      "loss": 2.4952,
      "step": 55
    },
    {
      "epoch": 0.05097280691773808,
      "grad_norm": 0.411510705947876,
      "learning_rate": 0.00019912079078266085,
      "loss": 2.6141,
      "step": 56
    },
    {
      "epoch": 0.05188303561269769,
      "grad_norm": 0.42673829197883606,
      "learning_rate": 0.0001990822080049946,
      "loss": 2.5089,
      "step": 57
    },
    {
      "epoch": 0.0527932643076573,
      "grad_norm": 0.3956531882286072,
      "learning_rate": 0.0001990428006348656,
      "loss": 2.4539,
      "step": 58
    },
    {
      "epoch": 0.053703493002616905,
      "grad_norm": 0.3687964975833893,
      "learning_rate": 0.00019900256900023413,
      "loss": 2.3752,
      "step": 59
    },
    {
      "epoch": 0.054613721697576514,
      "grad_norm": 0.34918078780174255,
      "learning_rate": 0.00019896151343592008,
      "loss": 2.2978,
      "step": 60
    },
    {
      "epoch": 0.055523950392536124,
      "grad_norm": 0.3795469403266907,
      "learning_rate": 0.00019891963428360043,
      "loss": 2.4324,
      "step": 61
    },
    {
      "epoch": 0.056434179087495734,
      "grad_norm": 0.37362977862358093,
      "learning_rate": 0.00019887693189180633,
      "loss": 2.3966,
      "step": 62
    },
    {
      "epoch": 0.057344407782455344,
      "grad_norm": 0.3695915639400482,
      "learning_rate": 0.00019883340661592015,
      "loss": 2.3188,
      "step": 63
    },
    {
      "epoch": 0.05825463647741495,
      "grad_norm": 0.376068651676178,
      "learning_rate": 0.00019878905881817252,
      "loss": 2.3933,
      "step": 64
    },
    {
      "epoch": 0.059164865172374556,
      "grad_norm": 0.3845527470111847,
      "learning_rate": 0.00019874388886763944,
      "loss": 2.5612,
      "step": 65
    },
    {
      "epoch": 0.060075093867334166,
      "grad_norm": 0.3678928315639496,
      "learning_rate": 0.00019869789714023906,
      "loss": 2.4174,
      "step": 66
    },
    {
      "epoch": 0.060985322562293776,
      "grad_norm": 0.3534240126609802,
      "learning_rate": 0.00019865108401872857,
      "loss": 2.2843,
      "step": 67
    },
    {
      "epoch": 0.061895551257253385,
      "grad_norm": 0.3559145927429199,
      "learning_rate": 0.00019860344989270113,
      "loss": 2.0911,
      "step": 68
    },
    {
      "epoch": 0.062805779952213,
      "grad_norm": 0.3940396308898926,
      "learning_rate": 0.0001985549951585825,
      "loss": 2.2796,
      "step": 69
    },
    {
      "epoch": 0.0637160086471726,
      "grad_norm": 0.39386069774627686,
      "learning_rate": 0.00019850572021962788,
      "loss": 2.1712,
      "step": 70
    },
    {
      "epoch": 0.06462623734213221,
      "grad_norm": 0.3923673629760742,
      "learning_rate": 0.00019845562548591826,
      "loss": 2.1665,
      "step": 71
    },
    {
      "epoch": 0.06553646603709182,
      "grad_norm": 0.3916044533252716,
      "learning_rate": 0.00019840471137435746,
      "loss": 2.267,
      "step": 72
    },
    {
      "epoch": 0.06644669473205143,
      "grad_norm": 0.4030390679836273,
      "learning_rate": 0.00019835297830866826,
      "loss": 2.2662,
      "step": 73
    },
    {
      "epoch": 0.06735692342701104,
      "grad_norm": 0.4568188786506653,
      "learning_rate": 0.00019830042671938904,
      "loss": 2.4341,
      "step": 74
    },
    {
      "epoch": 0.06826715212197064,
      "grad_norm": 0.4151420593261719,
      "learning_rate": 0.00019824705704387028,
      "loss": 2.2547,
      "step": 75
    },
    {
      "epoch": 0.06917738081693026,
      "grad_norm": 0.4141649007797241,
      "learning_rate": 0.00019819286972627066,
      "loss": 2.157,
      "step": 76
    },
    {
      "epoch": 0.07008760951188986,
      "grad_norm": 0.3980599343776703,
      "learning_rate": 0.00019813786521755372,
      "loss": 2.0681,
      "step": 77
    },
    {
      "epoch": 0.07099783820684948,
      "grad_norm": 0.41139495372772217,
      "learning_rate": 0.00019808204397548377,
      "loss": 2.2297,
      "step": 78
    },
    {
      "epoch": 0.07190806690180908,
      "grad_norm": 0.4236723482608795,
      "learning_rate": 0.0001980254064646223,
      "loss": 2.1775,
      "step": 79
    },
    {
      "epoch": 0.07281829559676868,
      "grad_norm": 0.413194864988327,
      "learning_rate": 0.00019796795315632395,
      "loss": 2.1926,
      "step": 80
    },
    {
      "epoch": 0.0737285242917283,
      "grad_norm": 0.4185364246368408,
      "learning_rate": 0.0001979096845287328,
      "loss": 2.0727,
      "step": 81
    },
    {
      "epoch": 0.0746387529866879,
      "grad_norm": 0.41983237862586975,
      "learning_rate": 0.00019785060106677818,
      "loss": 2.3075,
      "step": 82
    },
    {
      "epoch": 0.07554898168164752,
      "grad_norm": 0.44238418340682983,
      "learning_rate": 0.00019779070326217074,
      "loss": 2.2824,
      "step": 83
    },
    {
      "epoch": 0.07645921037660712,
      "grad_norm": 0.45874810218811035,
      "learning_rate": 0.00019772999161339833,
      "loss": 2.2601,
      "step": 84
    },
    {
      "epoch": 0.07736943907156674,
      "grad_norm": 0.4516693651676178,
      "learning_rate": 0.00019766846662572191,
      "loss": 2.314,
      "step": 85
    },
    {
      "epoch": 0.07827966776652634,
      "grad_norm": 0.43325892090797424,
      "learning_rate": 0.00019760612881117125,
      "loss": 2.1802,
      "step": 86
    },
    {
      "epoch": 0.07918989646148594,
      "grad_norm": 0.4844377636909485,
      "learning_rate": 0.00019754297868854073,
      "loss": 2.3255,
      "step": 87
    },
    {
      "epoch": 0.08010012515644556,
      "grad_norm": 0.44133615493774414,
      "learning_rate": 0.00019747901678338496,
      "loss": 2.2873,
      "step": 88
    },
    {
      "epoch": 0.08101035385140516,
      "grad_norm": 0.5170537829399109,
      "learning_rate": 0.00019741424362801452,
      "loss": 2.3539,
      "step": 89
    },
    {
      "epoch": 0.08192058254636478,
      "grad_norm": 0.46766072511672974,
      "learning_rate": 0.00019734865976149145,
      "loss": 2.2522,
      "step": 90
    },
    {
      "epoch": 0.08283081124132438,
      "grad_norm": 0.4890718162059784,
      "learning_rate": 0.00019728226572962473,
      "loss": 2.4453,
      "step": 91
    },
    {
      "epoch": 0.083741039936284,
      "grad_norm": 0.481082946062088,
      "learning_rate": 0.00019721506208496585,
      "loss": 2.2735,
      "step": 92
    },
    {
      "epoch": 0.0846512686312436,
      "grad_norm": 0.4898594319820404,
      "learning_rate": 0.00019714704938680408,
      "loss": 2.223,
      "step": 93
    },
    {
      "epoch": 0.0855614973262032,
      "grad_norm": 0.48203563690185547,
      "learning_rate": 0.00019707822820116193,
      "loss": 2.2461,
      "step": 94
    },
    {
      "epoch": 0.08647172602116282,
      "grad_norm": 0.5068606734275818,
      "learning_rate": 0.00019700859910079036,
      "loss": 2.1878,
      "step": 95
    },
    {
      "epoch": 0.08738195471612242,
      "grad_norm": 0.5902338027954102,
      "learning_rate": 0.00019693816266516407,
      "loss": 2.6452,
      "step": 96
    },
    {
      "epoch": 0.08829218341108204,
      "grad_norm": 0.6118764281272888,
      "learning_rate": 0.00019686691948047664,
      "loss": 2.4923,
      "step": 97
    },
    {
      "epoch": 0.08920241210604164,
      "grad_norm": 0.6774008274078369,
      "learning_rate": 0.00019679487013963564,
      "loss": 2.7092,
      "step": 98
    },
    {
      "epoch": 0.09011264080100125,
      "grad_norm": 0.8197199106216431,
      "learning_rate": 0.00019672201524225776,
      "loss": 2.7662,
      "step": 99
    },
    {
      "epoch": 0.09102286949596086,
      "grad_norm": 1.4529683589935303,
      "learning_rate": 0.0001966483553946637,
      "loss": 2.7347,
      "step": 100
    },
    {
      "epoch": 0.09193309819092046,
      "grad_norm": 1.3071473836898804,
      "learning_rate": 0.00019657389120987333,
      "loss": 2.8331,
      "step": 101
    },
    {
      "epoch": 0.09284332688588008,
      "grad_norm": 0.5390121340751648,
      "learning_rate": 0.00019649862330760036,
      "loss": 2.724,
      "step": 102
    },
    {
      "epoch": 0.09375355558083968,
      "grad_norm": 0.43149107694625854,
      "learning_rate": 0.00019642255231424729,
      "loss": 2.5445,
      "step": 103
    },
    {
      "epoch": 0.0946637842757993,
      "grad_norm": 0.4529167115688324,
      "learning_rate": 0.00019634567886290025,
      "loss": 2.4586,
      "step": 104
    },
    {
      "epoch": 0.0955740129707589,
      "grad_norm": 0.4370836317539215,
      "learning_rate": 0.00019626800359332362,
      "loss": 2.5472,
      "step": 105
    },
    {
      "epoch": 0.0964842416657185,
      "grad_norm": 0.43084716796875,
      "learning_rate": 0.00019618952715195475,
      "loss": 2.5307,
      "step": 106
    },
    {
      "epoch": 0.09739447036067812,
      "grad_norm": 0.4851230978965759,
      "learning_rate": 0.0001961102501918986,
      "loss": 2.3863,
      "step": 107
    },
    {
      "epoch": 0.09830469905563773,
      "grad_norm": 0.39606913924217224,
      "learning_rate": 0.00019603017337292236,
      "loss": 2.4334,
      "step": 108
    },
    {
      "epoch": 0.09921492775059734,
      "grad_norm": 0.3680226504802704,
      "learning_rate": 0.00019594929736144976,
      "loss": 2.3358,
      "step": 109
    },
    {
      "epoch": 0.10012515644555695,
      "grad_norm": 0.43170568346977234,
      "learning_rate": 0.00019586762283055573,
      "loss": 2.3502,
      "step": 110
    },
    {
      "epoch": 0.10103538514051655,
      "grad_norm": 0.4067252576351166,
      "learning_rate": 0.00019578515045996073,
      "loss": 2.4522,
      "step": 111
    },
    {
      "epoch": 0.10194561383547616,
      "grad_norm": 0.3645339906215668,
      "learning_rate": 0.0001957018809360251,
      "loss": 2.3924,
      "step": 112
    },
    {
      "epoch": 0.10285584253043577,
      "grad_norm": 0.4669637084007263,
      "learning_rate": 0.00019561781495174328,
      "loss": 2.3151,
      "step": 113
    },
    {
      "epoch": 0.10376607122539538,
      "grad_norm": 0.39779335260391235,
      "learning_rate": 0.00019553295320673807,
      "loss": 2.286,
      "step": 114
    },
    {
      "epoch": 0.10467629992035499,
      "grad_norm": 0.41239792108535767,
      "learning_rate": 0.00019544729640725498,
      "loss": 2.4233,
      "step": 115
    },
    {
      "epoch": 0.1055865286153146,
      "grad_norm": 0.3813844621181488,
      "learning_rate": 0.0001953608452661561,
      "loss": 2.3497,
      "step": 116
    },
    {
      "epoch": 0.1064967573102742,
      "grad_norm": 0.3577946424484253,
      "learning_rate": 0.0001952736005029142,
      "loss": 2.2817,
      "step": 117
    },
    {
      "epoch": 0.10740698600523381,
      "grad_norm": 0.3643464744091034,
      "learning_rate": 0.00019518556284360696,
      "loss": 2.2296,
      "step": 118
    },
    {
      "epoch": 0.10831721470019343,
      "grad_norm": 0.38557112216949463,
      "learning_rate": 0.00019509673302091075,
      "loss": 2.3039,
      "step": 119
    },
    {
      "epoch": 0.10922744339515303,
      "grad_norm": 0.39795589447021484,
      "learning_rate": 0.00019500711177409454,
      "loss": 2.3028,
      "step": 120
    },
    {
      "epoch": 0.11013767209011265,
      "grad_norm": 0.3612115681171417,
      "learning_rate": 0.00019491669984901379,
      "loss": 2.0678,
      "step": 121
    },
    {
      "epoch": 0.11104790078507225,
      "grad_norm": 0.36631131172180176,
      "learning_rate": 0.00019482549799810413,
      "loss": 2.2505,
      "step": 122
    },
    {
      "epoch": 0.11195812948003186,
      "grad_norm": 0.383908748626709,
      "learning_rate": 0.00019473350698037535,
      "loss": 2.1246,
      "step": 123
    },
    {
      "epoch": 0.11286835817499147,
      "grad_norm": 0.4124641716480255,
      "learning_rate": 0.00019464072756140486,
      "loss": 2.2511,
      "step": 124
    },
    {
      "epoch": 0.11377858686995107,
      "grad_norm": 0.3917684555053711,
      "learning_rate": 0.00019454716051333135,
      "loss": 2.2266,
      "step": 125
    },
    {
      "epoch": 0.11468881556491069,
      "grad_norm": 0.40930524468421936,
      "learning_rate": 0.00019445280661484847,
      "loss": 2.1455,
      "step": 126
    },
    {
      "epoch": 0.11559904425987029,
      "grad_norm": 0.4211941063404083,
      "learning_rate": 0.0001943576666511982,
      "loss": 2.1991,
      "step": 127
    },
    {
      "epoch": 0.1165092729548299,
      "grad_norm": 0.4424692392349243,
      "learning_rate": 0.00019426174141416448,
      "loss": 2.1868,
      "step": 128
    },
    {
      "epoch": 0.11741950164978951,
      "grad_norm": 0.41721439361572266,
      "learning_rate": 0.00019416503170206645,
      "loss": 2.3098,
      "step": 129
    },
    {
      "epoch": 0.11832973034474911,
      "grad_norm": 0.40203043818473816,
      "learning_rate": 0.00019406753831975203,
      "loss": 2.1059,
      "step": 130
    },
    {
      "epoch": 0.11923995903970873,
      "grad_norm": 0.4373216927051544,
      "learning_rate": 0.00019396926207859084,
      "loss": 2.0548,
      "step": 131
    },
    {
      "epoch": 0.12015018773466833,
      "grad_norm": 0.46566876769065857,
      "learning_rate": 0.00019387020379646797,
      "loss": 2.2075,
      "step": 132
    },
    {
      "epoch": 0.12106041642962795,
      "grad_norm": 0.4372277557849884,
      "learning_rate": 0.00019377036429777672,
      "loss": 2.1329,
      "step": 133
    },
    {
      "epoch": 0.12197064512458755,
      "grad_norm": 0.42452472448349,
      "learning_rate": 0.0001936697444134119,
      "loss": 2.1308,
      "step": 134
    },
    {
      "epoch": 0.12288087381954717,
      "grad_norm": 0.4379199743270874,
      "learning_rate": 0.0001935683449807631,
      "loss": 2.2978,
      "step": 135
    },
    {
      "epoch": 0.12379110251450677,
      "grad_norm": 0.46032705903053284,
      "learning_rate": 0.0001934661668437073,
      "loss": 2.2349,
      "step": 136
    },
    {
      "epoch": 0.12470133120946637,
      "grad_norm": 0.4644859731197357,
      "learning_rate": 0.00019336321085260236,
      "loss": 2.2485,
      "step": 137
    },
    {
      "epoch": 0.125611559904426,
      "grad_norm": 0.4860181510448456,
      "learning_rate": 0.00019325947786427952,
      "loss": 2.2059,
      "step": 138
    },
    {
      "epoch": 0.1265217885993856,
      "grad_norm": 0.46559709310531616,
      "learning_rate": 0.0001931549687420364,
      "loss": 2.2564,
      "step": 139
    },
    {
      "epoch": 0.1274320172943452,
      "grad_norm": 0.47857019305229187,
      "learning_rate": 0.00019304968435562993,
      "loss": 2.1526,
      "step": 140
    },
    {
      "epoch": 0.12834224598930483,
      "grad_norm": 0.5288405418395996,
      "learning_rate": 0.00019294362558126905,
      "loss": 2.399,
      "step": 141
    },
    {
      "epoch": 0.12925247468426443,
      "grad_norm": 0.4651063084602356,
      "learning_rate": 0.00019283679330160726,
      "loss": 2.3092,
      "step": 142
    },
    {
      "epoch": 0.13016270337922403,
      "grad_norm": 0.4940643310546875,
      "learning_rate": 0.00019272918840573558,
      "loss": 2.2838,
      "step": 143
    },
    {
      "epoch": 0.13107293207418363,
      "grad_norm": 0.5267408490180969,
      "learning_rate": 0.00019262081178917482,
      "loss": 2.3948,
      "step": 144
    },
    {
      "epoch": 0.13198316076914324,
      "grad_norm": 0.5252528190612793,
      "learning_rate": 0.0001925116643538684,
      "loss": 2.3055,
      "step": 145
    },
    {
      "epoch": 0.13289338946410287,
      "grad_norm": 0.5565192699432373,
      "learning_rate": 0.00019240174700817464,
      "loss": 2.2392,
      "step": 146
    },
    {
      "epoch": 0.13380361815906247,
      "grad_norm": 0.6170883774757385,
      "learning_rate": 0.00019229106066685937,
      "loss": 2.4212,
      "step": 147
    },
    {
      "epoch": 0.13471384685402207,
      "grad_norm": 0.623285710811615,
      "learning_rate": 0.0001921796062510882,
      "loss": 2.5778,
      "step": 148
    },
    {
      "epoch": 0.13562407554898168,
      "grad_norm": 0.7146514654159546,
      "learning_rate": 0.0001920673846884189,
      "loss": 2.5517,
      "step": 149
    },
    {
      "epoch": 0.13653430424394128,
      "grad_norm": 1.1342458724975586,
      "learning_rate": 0.00019195439691279363,
      "loss": 2.7753,
      "step": 150
    },
    {
      "epoch": 0.1374445329389009,
      "grad_norm": 0.5309027433395386,
      "learning_rate": 0.00019184064386453128,
      "loss": 2.6207,
      "step": 151
    },
    {
      "epoch": 0.1383547616338605,
      "grad_norm": 0.5145514607429504,
      "learning_rate": 0.00019172612649031952,
      "loss": 2.605,
      "step": 152
    },
    {
      "epoch": 0.13926499032882012,
      "grad_norm": 0.41491782665252686,
      "learning_rate": 0.00019161084574320696,
      "loss": 2.3562,
      "step": 153
    },
    {
      "epoch": 0.14017521902377972,
      "grad_norm": 0.4266800880432129,
      "learning_rate": 0.00019149480258259533,
      "loss": 2.5725,
      "step": 154
    },
    {
      "epoch": 0.14108544771873932,
      "grad_norm": 0.3961375951766968,
      "learning_rate": 0.00019137799797423126,
      "loss": 2.4722,
      "step": 155
    },
    {
      "epoch": 0.14199567641369895,
      "grad_norm": 0.41011351346969604,
      "learning_rate": 0.00019126043289019852,
      "loss": 2.5218,
      "step": 156
    },
    {
      "epoch": 0.14290590510865855,
      "grad_norm": 0.4531441032886505,
      "learning_rate": 0.00019114210830890969,
      "loss": 2.5233,
      "step": 157
    },
    {
      "epoch": 0.14381613380361816,
      "grad_norm": 0.447412371635437,
      "learning_rate": 0.00019102302521509815,
      "loss": 2.624,
      "step": 158
    },
    {
      "epoch": 0.14472636249857776,
      "grad_norm": 0.41504496335983276,
      "learning_rate": 0.00019090318459980986,
      "loss": 2.5468,
      "step": 159
    },
    {
      "epoch": 0.14563659119353736,
      "grad_norm": 0.3992937505245209,
      "learning_rate": 0.00019078258746039507,
      "loss": 2.5191,
      "step": 160
    },
    {
      "epoch": 0.146546819888497,
      "grad_norm": 0.37622761726379395,
      "learning_rate": 0.00019066123480050015,
      "loss": 2.3164,
      "step": 161
    },
    {
      "epoch": 0.1474570485834566,
      "grad_norm": 0.3903200626373291,
      "learning_rate": 0.00019053912763005907,
      "loss": 2.1591,
      "step": 162
    },
    {
      "epoch": 0.1483672772784162,
      "grad_norm": 0.4018140733242035,
      "learning_rate": 0.00019041626696528503,
      "loss": 2.3273,
      "step": 163
    },
    {
      "epoch": 0.1492775059733758,
      "grad_norm": 0.4058266878128052,
      "learning_rate": 0.00019029265382866214,
      "loss": 2.4121,
      "step": 164
    },
    {
      "epoch": 0.15018773466833543,
      "grad_norm": 0.41517823934555054,
      "learning_rate": 0.0001901682892489367,
      "loss": 2.3822,
      "step": 165
    },
    {
      "epoch": 0.15109796336329503,
      "grad_norm": 0.40526044368743896,
      "learning_rate": 0.0001900431742611089,
      "loss": 2.3957,
      "step": 166
    },
    {
      "epoch": 0.15200819205825464,
      "grad_norm": 0.39370712637901306,
      "learning_rate": 0.00018991730990642388,
      "loss": 2.2458,
      "step": 167
    },
    {
      "epoch": 0.15291842075321424,
      "grad_norm": 0.37955179810523987,
      "learning_rate": 0.00018979069723236333,
      "loss": 2.246,
      "step": 168
    },
    {
      "epoch": 0.15382864944817384,
      "grad_norm": 0.3796885907649994,
      "learning_rate": 0.00018966333729263674,
      "loss": 2.2288,
      "step": 169
    },
    {
      "epoch": 0.15473887814313347,
      "grad_norm": 0.37720954418182373,
      "learning_rate": 0.00018953523114717245,
      "loss": 2.2816,
      "step": 170
    },
    {
      "epoch": 0.15564910683809308,
      "grad_norm": 0.3945274353027344,
      "learning_rate": 0.00018940637986210906,
      "loss": 2.1191,
      "step": 171
    },
    {
      "epoch": 0.15655933553305268,
      "grad_norm": 0.42212241888046265,
      "learning_rate": 0.0001892767845097864,
      "loss": 2.3253,
      "step": 172
    },
    {
      "epoch": 0.15746956422801228,
      "grad_norm": 0.3840961456298828,
      "learning_rate": 0.00018914644616873657,
      "loss": 2.0736,
      "step": 173
    },
    {
      "epoch": 0.15837979292297188,
      "grad_norm": 0.3896063268184662,
      "learning_rate": 0.0001890153659236753,
      "loss": 2.1033,
      "step": 174
    },
    {
      "epoch": 0.15929002161793152,
      "grad_norm": 0.3891961872577667,
      "learning_rate": 0.00018888354486549237,
      "loss": 2.1627,
      "step": 175
    },
    {
      "epoch": 0.16020025031289112,
      "grad_norm": 0.4236443042755127,
      "learning_rate": 0.00018875098409124302,
      "loss": 2.2984,
      "step": 176
    },
    {
      "epoch": 0.16111047900785072,
      "grad_norm": 0.39355891942977905,
      "learning_rate": 0.0001886176847041386,
      "loss": 2.1823,
      "step": 177
    },
    {
      "epoch": 0.16202070770281032,
      "grad_norm": 0.4226863384246826,
      "learning_rate": 0.00018848364781353744,
      "loss": 2.2797,
      "step": 178
    },
    {
      "epoch": 0.16293093639776993,
      "grad_norm": 0.4181986451148987,
      "learning_rate": 0.0001883488745349355,
      "loss": 2.1999,
      "step": 179
    },
    {
      "epoch": 0.16384116509272956,
      "grad_norm": 0.4138600528240204,
      "learning_rate": 0.0001882133659899573,
      "loss": 1.8991,
      "step": 180
    },
    {
      "epoch": 0.16475139378768916,
      "grad_norm": 0.43699535727500916,
      "learning_rate": 0.00018807712330634642,
      "loss": 2.4304,
      "step": 181
    },
    {
      "epoch": 0.16566162248264876,
      "grad_norm": 0.4426632523536682,
      "learning_rate": 0.0001879401476179562,
      "loss": 2.2588,
      "step": 182
    },
    {
      "epoch": 0.16657185117760837,
      "grad_norm": 0.4601946473121643,
      "learning_rate": 0.0001878024400647402,
      "loss": 2.4086,
      "step": 183
    },
    {
      "epoch": 0.167482079872568,
      "grad_norm": 0.41164615750312805,
      "learning_rate": 0.00018766400179274286,
      "loss": 1.9165,
      "step": 184
    },
    {
      "epoch": 0.1683923085675276,
      "grad_norm": 0.42600390315055847,
      "learning_rate": 0.00018752483395408987,
      "loss": 2.0585,
      "step": 185
    },
    {
      "epoch": 0.1693025372624872,
      "grad_norm": 0.4481484889984131,
      "learning_rate": 0.00018738493770697852,
      "loss": 2.2189,
      "step": 186
    },
    {
      "epoch": 0.1702127659574468,
      "grad_norm": 0.47090041637420654,
      "learning_rate": 0.00018724431421566823,
      "loss": 2.3016,
      "step": 187
    },
    {
      "epoch": 0.1711229946524064,
      "grad_norm": 0.4611455798149109,
      "learning_rate": 0.00018710296465047075,
      "loss": 2.2475,
      "step": 188
    },
    {
      "epoch": 0.17203322334736604,
      "grad_norm": 0.4549846053123474,
      "learning_rate": 0.0001869608901877404,
      "loss": 2.0331,
      "step": 189
    },
    {
      "epoch": 0.17294345204232564,
      "grad_norm": 0.4108966886997223,
      "learning_rate": 0.0001868180920098644,
      "loss": 2.0985,
      "step": 190
    },
    {
      "epoch": 0.17385368073728524,
      "grad_norm": 0.505251944065094,
      "learning_rate": 0.00018667457130525284,
      "loss": 2.1995,
      "step": 191
    },
    {
      "epoch": 0.17476390943224485,
      "grad_norm": 0.4776213467121124,
      "learning_rate": 0.00018653032926832896,
      "loss": 2.2254,
      "step": 192
    },
    {
      "epoch": 0.17567413812720445,
      "grad_norm": 0.4876101016998291,
      "learning_rate": 0.00018638536709951917,
      "loss": 2.2885,
      "step": 193
    },
    {
      "epoch": 0.17658436682216408,
      "grad_norm": 0.49354955554008484,
      "learning_rate": 0.000186239686005243,
      "loss": 2.2939,
      "step": 194
    },
    {
      "epoch": 0.17749459551712368,
      "grad_norm": 0.5280092358589172,
      "learning_rate": 0.0001860932871979031,
      "loss": 2.3555,
      "step": 195
    },
    {
      "epoch": 0.17840482421208329,
      "grad_norm": 0.5320273637771606,
      "learning_rate": 0.00018594617189587512,
      "loss": 2.2854,
      "step": 196
    },
    {
      "epoch": 0.1793150529070429,
      "grad_norm": 0.6043667793273926,
      "learning_rate": 0.00018579834132349772,
      "loss": 2.6826,
      "step": 197
    },
    {
      "epoch": 0.1802252816020025,
      "grad_norm": 0.6498908996582031,
      "learning_rate": 0.0001856497967110621,
      "loss": 2.5815,
      "step": 198
    },
    {
      "epoch": 0.18113551029696212,
      "grad_norm": 0.8005975484848022,
      "learning_rate": 0.00018550053929480202,
      "loss": 2.7034,
      "step": 199
    },
    {
      "epoch": 0.18204573899192172,
      "grad_norm": 1.9555100202560425,
      "learning_rate": 0.00018535057031688335,
      "loss": 2.9439,
      "step": 200
    },
    {
      "epoch": 0.18295596768688133,
      "grad_norm": 0.65727299451828,
      "learning_rate": 0.0001851998910253939,
      "loss": 2.695,
      "step": 201
    },
    {
      "epoch": 0.18386619638184093,
      "grad_norm": 0.6414365172386169,
      "learning_rate": 0.0001850485026743328,
      "loss": 2.5447,
      "step": 202
    },
    {
      "epoch": 0.18477642507680053,
      "grad_norm": 0.5610425472259521,
      "learning_rate": 0.00018489640652360022,
      "loss": 2.4502,
      "step": 203
    },
    {
      "epoch": 0.18568665377176016,
      "grad_norm": 0.4109022319316864,
      "learning_rate": 0.00018474360383898694,
      "loss": 2.5165,
      "step": 204
    },
    {
      "epoch": 0.18659688246671977,
      "grad_norm": 0.4291420578956604,
      "learning_rate": 0.00018459009589216364,
      "loss": 2.4464,
      "step": 205
    },
    {
      "epoch": 0.18750711116167937,
      "grad_norm": 0.44230878353118896,
      "learning_rate": 0.0001844358839606705,
      "loss": 2.4365,
      "step": 206
    },
    {
      "epoch": 0.18841733985663897,
      "grad_norm": 0.4131537675857544,
      "learning_rate": 0.00018428096932790632,
      "loss": 2.5135,
      "step": 207
    },
    {
      "epoch": 0.1893275685515986,
      "grad_norm": 0.41846412420272827,
      "learning_rate": 0.00018412535328311814,
      "loss": 2.3571,
      "step": 208
    },
    {
      "epoch": 0.1902377972465582,
      "grad_norm": 0.4208964407444,
      "learning_rate": 0.0001839690371213903,
      "loss": 2.4642,
      "step": 209
    },
    {
      "epoch": 0.1911480259415178,
      "grad_norm": 0.40538257360458374,
      "learning_rate": 0.0001838120221436338,
      "loss": 2.4407,
      "step": 210
    },
    {
      "epoch": 0.1920582546364774,
      "grad_norm": 0.4170033633708954,
      "learning_rate": 0.00018365430965657526,
      "loss": 2.4844,
      "step": 211
    },
    {
      "epoch": 0.192968483331437,
      "grad_norm": 0.3793712258338928,
      "learning_rate": 0.00018349590097274632,
      "loss": 2.2296,
      "step": 212
    },
    {
      "epoch": 0.19387871202639664,
      "grad_norm": 0.38766035437583923,
      "learning_rate": 0.00018333679741047254,
      "loss": 2.2268,
      "step": 213
    },
    {
      "epoch": 0.19478894072135625,
      "grad_norm": 0.3814389109611511,
      "learning_rate": 0.00018317700029386245,
      "loss": 2.0952,
      "step": 214
    },
    {
      "epoch": 0.19569916941631585,
      "grad_norm": 0.4213506579399109,
      "learning_rate": 0.00018301651095279655,
      "loss": 2.3572,
      "step": 215
    },
    {
      "epoch": 0.19660939811127545,
      "grad_norm": 0.41608527302742004,
      "learning_rate": 0.0001828553307229163,
      "loss": 2.2262,
      "step": 216
    },
    {
      "epoch": 0.19751962680623505,
      "grad_norm": 0.3709307312965393,
      "learning_rate": 0.0001826934609456129,
      "loss": 2.2569,
      "step": 217
    },
    {
      "epoch": 0.19842985550119469,
      "grad_norm": 0.400329053401947,
      "learning_rate": 0.00018253090296801614,
      "loss": 2.2614,
      "step": 218
    },
    {
      "epoch": 0.1993400841961543,
      "grad_norm": 0.38953447341918945,
      "learning_rate": 0.0001823676581429833,
      "loss": 2.1335,
      "step": 219
    },
    {
      "epoch": 0.2002503128911139,
      "grad_norm": 0.39479631185531616,
      "learning_rate": 0.00018220372782908777,
      "loss": 2.2293,
      "step": 220
    },
    {
      "epoch": 0.2011605415860735,
      "grad_norm": 0.3911544382572174,
      "learning_rate": 0.00018203911339060783,
      "loss": 2.0099,
      "step": 221
    },
    {
      "epoch": 0.2020707702810331,
      "grad_norm": 0.40512001514434814,
      "learning_rate": 0.00018187381619751516,
      "loss": 2.2282,
      "step": 222
    },
    {
      "epoch": 0.20298099897599273,
      "grad_norm": 0.42339497804641724,
      "learning_rate": 0.00018170783762546365,
      "loss": 2.1722,
      "step": 223
    },
    {
      "epoch": 0.20389122767095233,
      "grad_norm": 0.4218563139438629,
      "learning_rate": 0.00018154117905577776,
      "loss": 2.1781,
      "step": 224
    },
    {
      "epoch": 0.20480145636591193,
      "grad_norm": 0.39831289649009705,
      "learning_rate": 0.00018137384187544116,
      "loss": 2.1663,
      "step": 225
    },
    {
      "epoch": 0.20571168506087154,
      "grad_norm": 0.39963746070861816,
      "learning_rate": 0.00018120582747708502,
      "loss": 1.97,
      "step": 226
    },
    {
      "epoch": 0.20662191375583117,
      "grad_norm": 0.38624411821365356,
      "learning_rate": 0.0001810371372589766,
      "loss": 1.9793,
      "step": 227
    },
    {
      "epoch": 0.20753214245079077,
      "grad_norm": 0.4173285961151123,
      "learning_rate": 0.0001808677726250076,
      "loss": 2.1832,
      "step": 228
    },
    {
      "epoch": 0.20844237114575037,
      "grad_norm": 0.393627405166626,
      "learning_rate": 0.00018069773498468223,
      "loss": 2.003,
      "step": 229
    },
    {
      "epoch": 0.20935259984070997,
      "grad_norm": 0.4344862699508667,
      "learning_rate": 0.00018052702575310588,
      "loss": 2.235,
      "step": 230
    },
    {
      "epoch": 0.21026282853566958,
      "grad_norm": 0.42050907015800476,
      "learning_rate": 0.00018035564635097298,
      "loss": 2.0272,
      "step": 231
    },
    {
      "epoch": 0.2111730572306292,
      "grad_norm": 0.45956531167030334,
      "learning_rate": 0.00018018359820455536,
      "loss": 2.277,
      "step": 232
    },
    {
      "epoch": 0.2120832859255888,
      "grad_norm": 0.4257940649986267,
      "learning_rate": 0.00018001088274569038,
      "loss": 2.1714,
      "step": 233
    },
    {
      "epoch": 0.2129935146205484,
      "grad_norm": 0.4153771996498108,
      "learning_rate": 0.00017983750141176895,
      "loss": 2.0942,
      "step": 234
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 0.44307154417037964,
      "learning_rate": 0.0001796634556457236,
      "loss": 2.1548,
      "step": 235
    },
    {
      "epoch": 0.21481397201046762,
      "grad_norm": 0.45095738768577576,
      "learning_rate": 0.0001794887468960165,
      "loss": 2.1294,
      "step": 236
    },
    {
      "epoch": 0.21572420070542725,
      "grad_norm": 0.4348011612892151,
      "learning_rate": 0.00017931337661662727,
      "loss": 2.1176,
      "step": 237
    },
    {
      "epoch": 0.21663442940038685,
      "grad_norm": 0.46650028228759766,
      "learning_rate": 0.0001791373462670411,
      "loss": 2.1333,
      "step": 238
    },
    {
      "epoch": 0.21754465809534645,
      "grad_norm": 0.49439772963523865,
      "learning_rate": 0.00017896065731223644,
      "loss": 2.1169,
      "step": 239
    },
    {
      "epoch": 0.21845488679030606,
      "grad_norm": 0.4834192395210266,
      "learning_rate": 0.00017878331122267284,
      "loss": 2.3213,
      "step": 240
    },
    {
      "epoch": 0.21936511548526566,
      "grad_norm": 0.49015435576438904,
      "learning_rate": 0.00017860530947427875,
      "loss": 2.3145,
      "step": 241
    },
    {
      "epoch": 0.2202753441802253,
      "grad_norm": 0.4994793236255646,
      "learning_rate": 0.00017842665354843922,
      "loss": 2.1586,
      "step": 242
    },
    {
      "epoch": 0.2211855728751849,
      "grad_norm": 0.5262652635574341,
      "learning_rate": 0.0001782473449319835,
      "loss": 2.3597,
      "step": 243
    },
    {
      "epoch": 0.2220958015701445,
      "grad_norm": 0.5197060704231262,
      "learning_rate": 0.0001780673851171728,
      "loss": 2.2091,
      "step": 244
    },
    {
      "epoch": 0.2230060302651041,
      "grad_norm": 0.5736340284347534,
      "learning_rate": 0.00017788677560168784,
      "loss": 2.489,
      "step": 245
    },
    {
      "epoch": 0.22391625896006373,
      "grad_norm": 0.5602148771286011,
      "learning_rate": 0.0001777055178886162,
      "loss": 2.4159,
      "step": 246
    },
    {
      "epoch": 0.22482648765502333,
      "grad_norm": 0.6122560501098633,
      "learning_rate": 0.0001775236134864401,
      "loss": 2.5168,
      "step": 247
    },
    {
      "epoch": 0.22573671634998294,
      "grad_norm": 0.6337876915931702,
      "learning_rate": 0.00017734106390902366,
      "loss": 2.522,
      "step": 248
    },
    {
      "epoch": 0.22664694504494254,
      "grad_norm": 0.8038336634635925,
      "learning_rate": 0.0001771578706756003,
      "loss": 2.5793,
      "step": 249
    },
    {
      "epoch": 0.22755717373990214,
      "grad_norm": 1.3161811828613281,
      "learning_rate": 0.0001769740353107602,
      "loss": 2.3232,
      "step": 250
    },
    {
      "epoch": 0.22846740243486177,
      "grad_norm": 0.5344598293304443,
      "learning_rate": 0.00017678955934443758,
      "loss": 2.5626,
      "step": 251
    },
    {
      "epoch": 0.22937763112982137,
      "grad_norm": 0.5922054052352905,
      "learning_rate": 0.0001766044443118978,
      "loss": 2.5528,
      "step": 252
    },
    {
      "epoch": 0.23028785982478098,
      "grad_norm": 0.4852460026741028,
      "learning_rate": 0.00017641869175372493,
      "loss": 2.4348,
      "step": 253
    },
    {
      "epoch": 0.23119808851974058,
      "grad_norm": 0.40997761487960815,
      "learning_rate": 0.00017623230321580854,
      "loss": 2.5167,
      "step": 254
    },
    {
      "epoch": 0.23210831721470018,
      "grad_norm": 0.40511611104011536,
      "learning_rate": 0.00017604528024933115,
      "loss": 2.479,
      "step": 255
    },
    {
      "epoch": 0.2330185459096598,
      "grad_norm": 0.3989182114601135,
      "learning_rate": 0.00017585762441075503,
      "loss": 2.3482,
      "step": 256
    },
    {
      "epoch": 0.23392877460461942,
      "grad_norm": 0.4280647039413452,
      "learning_rate": 0.00017566933726180964,
      "loss": 2.526,
      "step": 257
    },
    {
      "epoch": 0.23483900329957902,
      "grad_norm": 0.43045175075531006,
      "learning_rate": 0.0001754804203694782,
      "loss": 2.4734,
      "step": 258
    },
    {
      "epoch": 0.23574923199453862,
      "grad_norm": 0.41747456789016724,
      "learning_rate": 0.0001752908753059849,
      "loss": 2.3361,
      "step": 259
    },
    {
      "epoch": 0.23665946068949822,
      "grad_norm": 0.4000650942325592,
      "learning_rate": 0.00017510070364878177,
      "loss": 2.4496,
      "step": 260
    },
    {
      "epoch": 0.23756968938445785,
      "grad_norm": 0.3866179287433624,
      "learning_rate": 0.00017490990698053563,
      "loss": 2.3222,
      "step": 261
    },
    {
      "epoch": 0.23847991807941746,
      "grad_norm": 0.4083816409111023,
      "learning_rate": 0.00017471848688911464,
      "loss": 2.3619,
      "step": 262
    },
    {
      "epoch": 0.23939014677437706,
      "grad_norm": 0.4181024432182312,
      "learning_rate": 0.0001745264449675755,
      "loss": 2.3515,
      "step": 263
    },
    {
      "epoch": 0.24030037546933666,
      "grad_norm": 0.3845669627189636,
      "learning_rate": 0.00017433378281414975,
      "loss": 2.3866,
      "step": 264
    },
    {
      "epoch": 0.24121060416429627,
      "grad_norm": 0.3917085826396942,
      "learning_rate": 0.0001741405020322309,
      "loss": 2.4155,
      "step": 265
    },
    {
      "epoch": 0.2421208328592559,
      "grad_norm": 0.3883097171783447,
      "learning_rate": 0.00017394660423036075,
      "loss": 2.1034,
      "step": 266
    },
    {
      "epoch": 0.2430310615542155,
      "grad_norm": 0.37689700722694397,
      "learning_rate": 0.00017375209102221613,
      "loss": 2.1676,
      "step": 267
    },
    {
      "epoch": 0.2439412902491751,
      "grad_norm": 0.38591262698173523,
      "learning_rate": 0.00017355696402659548,
      "loss": 2.3066,
      "step": 268
    },
    {
      "epoch": 0.2448515189441347,
      "grad_norm": 0.3865531384944916,
      "learning_rate": 0.00017336122486740548,
      "loss": 2.1387,
      "step": 269
    },
    {
      "epoch": 0.24576174763909434,
      "grad_norm": 0.38778188824653625,
      "learning_rate": 0.00017316487517364721,
      "loss": 2.2369,
      "step": 270
    },
    {
      "epoch": 0.24667197633405394,
      "grad_norm": 0.38550639152526855,
      "learning_rate": 0.000172967916579403,
      "loss": 2.2013,
      "step": 271
    },
    {
      "epoch": 0.24758220502901354,
      "grad_norm": 0.41831621527671814,
      "learning_rate": 0.00017277035072382253,
      "loss": 2.1315,
      "step": 272
    },
    {
      "epoch": 0.24849243372397314,
      "grad_norm": 0.386983722448349,
      "learning_rate": 0.00017257217925110933,
      "loss": 2.1582,
      "step": 273
    },
    {
      "epoch": 0.24940266241893275,
      "grad_norm": 0.39023295044898987,
      "learning_rate": 0.00017237340381050703,
      "loss": 2.0061,
      "step": 274
    },
    {
      "epoch": 0.2503128911138924,
      "grad_norm": 0.4187031686306,
      "learning_rate": 0.00017217402605628572,
      "loss": 2.0814,
      "step": 275
    },
    {
      "epoch": 0.2503128911138924,
      "eval_loss": 2.2949306964874268,
      "eval_runtime": 205.2875,
      "eval_samples_per_second": 9.017,
      "eval_steps_per_second": 4.511,
      "step": 275
    },
    {
      "epoch": 0.251223119808852,
      "grad_norm": 0.39927324652671814,
      "learning_rate": 0.00017197404764772805,
      "loss": 2.1982,
      "step": 276
    },
    {
      "epoch": 0.2521333485038116,
      "grad_norm": 0.4287830889225006,
      "learning_rate": 0.00017177347024911562,
      "loss": 2.2733,
      "step": 277
    },
    {
      "epoch": 0.2530435771987712,
      "grad_norm": 0.3960012197494507,
      "learning_rate": 0.00017157229552971487,
      "loss": 2.1884,
      "step": 278
    },
    {
      "epoch": 0.2539538058937308,
      "grad_norm": 0.40106138586997986,
      "learning_rate": 0.00017137052516376345,
      "loss": 2.1207,
      "step": 279
    },
    {
      "epoch": 0.2548640345886904,
      "grad_norm": 0.4410867393016815,
      "learning_rate": 0.00017116816083045602,
      "loss": 2.3589,
      "step": 280
    },
    {
      "epoch": 0.25577426328365,
      "grad_norm": 0.4092939794063568,
      "learning_rate": 0.0001709652042139306,
      "loss": 2.0842,
      "step": 281
    },
    {
      "epoch": 0.25668449197860965,
      "grad_norm": 0.40820494294166565,
      "learning_rate": 0.0001707616570032542,
      "loss": 2.1658,
      "step": 282
    },
    {
      "epoch": 0.25759472067356926,
      "grad_norm": 0.41664186120033264,
      "learning_rate": 0.00017055752089240907,
      "loss": 2.1389,
      "step": 283
    },
    {
      "epoch": 0.25850494936852886,
      "grad_norm": 0.4125240445137024,
      "learning_rate": 0.00017035279758027832,
      "loss": 2.0615,
      "step": 284
    },
    {
      "epoch": 0.25941517806348846,
      "grad_norm": 0.42702898383140564,
      "learning_rate": 0.00017014748877063214,
      "loss": 2.017,
      "step": 285
    },
    {
      "epoch": 0.26032540675844806,
      "grad_norm": 0.44943541288375854,
      "learning_rate": 0.00016994159617211317,
      "loss": 2.1901,
      "step": 286
    },
    {
      "epoch": 0.26123563545340767,
      "grad_norm": 0.4286860227584839,
      "learning_rate": 0.00016973512149822274,
      "loss": 2.0643,
      "step": 287
    },
    {
      "epoch": 0.26214586414836727,
      "grad_norm": 0.44938111305236816,
      "learning_rate": 0.0001695280664673062,
      "loss": 2.1539,
      "step": 288
    },
    {
      "epoch": 0.26305609284332687,
      "grad_norm": 0.4638296067714691,
      "learning_rate": 0.0001693204328025389,
      "loss": 2.291,
      "step": 289
    },
    {
      "epoch": 0.2639663215382865,
      "grad_norm": 0.49295714497566223,
      "learning_rate": 0.00016911222223191182,
      "loss": 2.2538,
      "step": 290
    },
    {
      "epoch": 0.2648765502332461,
      "grad_norm": 0.48185715079307556,
      "learning_rate": 0.00016890343648821697,
      "loss": 2.2792,
      "step": 291
    },
    {
      "epoch": 0.26578677892820574,
      "grad_norm": 0.4750272035598755,
      "learning_rate": 0.0001686940773090333,
      "loss": 2.2774,
      "step": 292
    },
    {
      "epoch": 0.26669700762316534,
      "grad_norm": 0.5073033571243286,
      "learning_rate": 0.00016848414643671195,
      "loss": 2.3261,
      "step": 293
    },
    {
      "epoch": 0.26760723631812494,
      "grad_norm": 0.5343424081802368,
      "learning_rate": 0.00016827364561836187,
      "loss": 2.4097,
      "step": 294
    },
    {
      "epoch": 0.26851746501308454,
      "grad_norm": 0.5311369895935059,
      "learning_rate": 0.00016806257660583534,
      "loss": 2.3821,
      "step": 295
    },
    {
      "epoch": 0.26942769370804415,
      "grad_norm": 0.5551429986953735,
      "learning_rate": 0.00016785094115571322,
      "loss": 2.3795,
      "step": 296
    },
    {
      "epoch": 0.27033792240300375,
      "grad_norm": 0.6279783248901367,
      "learning_rate": 0.0001676387410292906,
      "loss": 2.435,
      "step": 297
    },
    {
      "epoch": 0.27124815109796335,
      "grad_norm": 0.7317250967025757,
      "learning_rate": 0.00016742597799256182,
      "loss": 2.6991,
      "step": 298
    },
    {
      "epoch": 0.27215837979292296,
      "grad_norm": 0.8485302329063416,
      "learning_rate": 0.000167212653816206,
      "loss": 2.7005,
      "step": 299
    },
    {
      "epoch": 0.27306860848788256,
      "grad_norm": 1.5959185361862183,
      "learning_rate": 0.00016699877027557226,
      "loss": 2.7536,
      "step": 300
    },
    {
      "epoch": 0.2739788371828422,
      "grad_norm": 0.4755174219608307,
      "learning_rate": 0.00016678432915066488,
      "loss": 2.5907,
      "step": 301
    },
    {
      "epoch": 0.2748890658778018,
      "grad_norm": 0.45389342308044434,
      "learning_rate": 0.00016656933222612854,
      "loss": 2.4622,
      "step": 302
    },
    {
      "epoch": 0.2757992945727614,
      "grad_norm": 0.4949435591697693,
      "learning_rate": 0.00016635378129123342,
      "loss": 2.4185,
      "step": 303
    },
    {
      "epoch": 0.276709523267721,
      "grad_norm": 0.4521631896495819,
      "learning_rate": 0.00016613767813986044,
      "loss": 2.4918,
      "step": 304
    },
    {
      "epoch": 0.2776197519626806,
      "grad_norm": 0.4228963553905487,
      "learning_rate": 0.0001659210245704861,
      "loss": 2.4194,
      "step": 305
    },
    {
      "epoch": 0.27852998065764023,
      "grad_norm": 0.4170341491699219,
      "learning_rate": 0.00016570382238616777,
      "loss": 2.4185,
      "step": 306
    },
    {
      "epoch": 0.27944020935259983,
      "grad_norm": 0.4123315215110779,
      "learning_rate": 0.00016548607339452853,
      "loss": 2.3737,
      "step": 307
    },
    {
      "epoch": 0.28035043804755944,
      "grad_norm": 0.4320162832736969,
      "learning_rate": 0.00016526777940774204,
      "loss": 2.3317,
      "step": 308
    },
    {
      "epoch": 0.28126066674251904,
      "grad_norm": 0.4118390381336212,
      "learning_rate": 0.00016504894224251778,
      "loss": 2.3786,
      "step": 309
    },
    {
      "epoch": 0.28217089543747864,
      "grad_norm": 0.39763331413269043,
      "learning_rate": 0.0001648295637200856,
      "loss": 2.2968,
      "step": 310
    },
    {
      "epoch": 0.2830811241324383,
      "grad_norm": 0.4391527473926544,
      "learning_rate": 0.0001646096456661807,
      "loss": 2.3764,
      "step": 311
    },
    {
      "epoch": 0.2839913528273979,
      "grad_norm": 0.43077877163887024,
      "learning_rate": 0.00016438918991102842,
      "loss": 2.2013,
      "step": 312
    },
    {
      "epoch": 0.2849015815223575,
      "grad_norm": 0.43149155378341675,
      "learning_rate": 0.000164168198289329,
      "loss": 2.3097,
      "step": 313
    },
    {
      "epoch": 0.2858118102173171,
      "grad_norm": 0.40134817361831665,
      "learning_rate": 0.00016394667264024246,
      "loss": 2.3306,
      "step": 314
    },
    {
      "epoch": 0.2867220389122767,
      "grad_norm": 0.4056681990623474,
      "learning_rate": 0.00016372461480737297,
      "loss": 2.3146,
      "step": 315
    },
    {
      "epoch": 0.2876322676072363,
      "grad_norm": 0.41738027334213257,
      "learning_rate": 0.00016350202663875386,
      "loss": 1.9997,
      "step": 316
    },
    {
      "epoch": 0.2885424963021959,
      "grad_norm": 0.38182246685028076,
      "learning_rate": 0.00016327890998683192,
      "loss": 2.0466,
      "step": 317
    },
    {
      "epoch": 0.2894527249971555,
      "grad_norm": 0.39759719371795654,
      "learning_rate": 0.00016305526670845226,
      "loss": 2.1788,
      "step": 318
    },
    {
      "epoch": 0.2903629536921151,
      "grad_norm": 0.3982352614402771,
      "learning_rate": 0.0001628310986648427,
      "loss": 2.2115,
      "step": 319
    },
    {
      "epoch": 0.2912731823870747,
      "grad_norm": 0.41679051518440247,
      "learning_rate": 0.0001626064077215983,
      "loss": 2.3036,
      "step": 320
    },
    {
      "epoch": 0.2921834110820344,
      "grad_norm": 0.40436604619026184,
      "learning_rate": 0.00016238119574866588,
      "loss": 2.1493,
      "step": 321
    },
    {
      "epoch": 0.293093639776994,
      "grad_norm": 0.4502476751804352,
      "learning_rate": 0.0001621554646203284,
      "loss": 1.8572,
      "step": 322
    },
    {
      "epoch": 0.2940038684719536,
      "grad_norm": 0.44303473830223083,
      "learning_rate": 0.00016192921621518944,
      "loss": 2.1832,
      "step": 323
    },
    {
      "epoch": 0.2949140971669132,
      "grad_norm": 0.4064692258834839,
      "learning_rate": 0.0001617024524161574,
      "loss": 2.2656,
      "step": 324
    },
    {
      "epoch": 0.2958243258618728,
      "grad_norm": 0.4479392170906067,
      "learning_rate": 0.0001614751751104301,
      "loss": 2.2462,
      "step": 325
    },
    {
      "epoch": 0.2967345545568324,
      "grad_norm": 0.4629363715648651,
      "learning_rate": 0.0001612473861894788,
      "loss": 1.9715,
      "step": 326
    },
    {
      "epoch": 0.297644783251792,
      "grad_norm": 0.3991665542125702,
      "learning_rate": 0.00016101908754903268,
      "loss": 2.0642,
      "step": 327
    },
    {
      "epoch": 0.2985550119467516,
      "grad_norm": 0.42503711581230164,
      "learning_rate": 0.00016079028108906282,
      "loss": 2.1403,
      "step": 328
    },
    {
      "epoch": 0.2994652406417112,
      "grad_norm": 0.4499455392360687,
      "learning_rate": 0.00016056096871376667,
      "loss": 2.0534,
      "step": 329
    },
    {
      "epoch": 0.30037546933667086,
      "grad_norm": 0.4549277424812317,
      "learning_rate": 0.00016033115233155202,
      "loss": 2.2083,
      "step": 330
    },
    {
      "epoch": 0.30128569803163047,
      "grad_norm": 0.3974262773990631,
      "learning_rate": 0.0001601008338550211,
      "loss": 2.0156,
      "step": 331
    },
    {
      "epoch": 0.30219592672659007,
      "grad_norm": 0.43566057085990906,
      "learning_rate": 0.00015987001520095478,
      "loss": 2.1801,
      "step": 332
    },
    {
      "epoch": 0.3031061554215497,
      "grad_norm": 0.47677701711654663,
      "learning_rate": 0.00015963869829029658,
      "loss": 2.1415,
      "step": 333
    },
    {
      "epoch": 0.3040163841165093,
      "grad_norm": 0.4603672921657562,
      "learning_rate": 0.00015940688504813662,
      "loss": 2.2967,
      "step": 334
    },
    {
      "epoch": 0.3049266128114689,
      "grad_norm": 0.4428515136241913,
      "learning_rate": 0.00015917457740369565,
      "loss": 2.1447,
      "step": 335
    },
    {
      "epoch": 0.3058368415064285,
      "grad_norm": 0.4379275441169739,
      "learning_rate": 0.000158941777290309,
      "loss": 2.0957,
      "step": 336
    },
    {
      "epoch": 0.3067470702013881,
      "grad_norm": 0.4831966459751129,
      "learning_rate": 0.00015870848664541044,
      "loss": 2.2457,
      "step": 337
    },
    {
      "epoch": 0.3076572988963477,
      "grad_norm": 0.45160865783691406,
      "learning_rate": 0.00015847470741051618,
      "loss": 2.1441,
      "step": 338
    },
    {
      "epoch": 0.3085675275913073,
      "grad_norm": 0.44453370571136475,
      "learning_rate": 0.00015824044153120852,
      "loss": 2.1073,
      "step": 339
    },
    {
      "epoch": 0.30947775628626695,
      "grad_norm": 0.49965375661849976,
      "learning_rate": 0.00015800569095711982,
      "loss": 2.1574,
      "step": 340
    },
    {
      "epoch": 0.31038798498122655,
      "grad_norm": 0.48138341307640076,
      "learning_rate": 0.00015777045764191625,
      "loss": 2.0205,
      "step": 341
    },
    {
      "epoch": 0.31129821367618615,
      "grad_norm": 0.5034924745559692,
      "learning_rate": 0.00015753474354328142,
      "loss": 2.2319,
      "step": 342
    },
    {
      "epoch": 0.31220844237114576,
      "grad_norm": 0.5034711956977844,
      "learning_rate": 0.00015729855062290022,
      "loss": 2.4066,
      "step": 343
    },
    {
      "epoch": 0.31311867106610536,
      "grad_norm": 0.5409703254699707,
      "learning_rate": 0.00015706188084644242,
      "loss": 2.2435,
      "step": 344
    },
    {
      "epoch": 0.31402889976106496,
      "grad_norm": 0.544597327709198,
      "learning_rate": 0.00015682473618354635,
      "loss": 2.2625,
      "step": 345
    },
    {
      "epoch": 0.31493912845602456,
      "grad_norm": 0.6114000082015991,
      "learning_rate": 0.0001565871186078025,
      "loss": 2.4302,
      "step": 346
    },
    {
      "epoch": 0.31584935715098417,
      "grad_norm": 0.6364843845367432,
      "learning_rate": 0.00015634903009673705,
      "loss": 2.5153,
      "step": 347
    },
    {
      "epoch": 0.31675958584594377,
      "grad_norm": 0.7510351538658142,
      "learning_rate": 0.00015611047263179548,
      "loss": 2.5605,
      "step": 348
    },
    {
      "epoch": 0.31766981454090343,
      "grad_norm": 0.8501291275024414,
      "learning_rate": 0.000155871448198326,
      "loss": 2.6519,
      "step": 349
    },
    {
      "epoch": 0.31858004323586303,
      "grad_norm": 1.7441632747650146,
      "learning_rate": 0.0001556319587855631,
      "loss": 2.7517,
      "step": 350
    },
    {
      "epoch": 0.31949027193082263,
      "grad_norm": 0.5301811695098877,
      "learning_rate": 0.00015539200638661104,
      "loss": 2.6647,
      "step": 351
    },
    {
      "epoch": 0.32040050062578224,
      "grad_norm": 0.5063616633415222,
      "learning_rate": 0.00015515159299842707,
      "loss": 2.4961,
      "step": 352
    },
    {
      "epoch": 0.32131072932074184,
      "grad_norm": 0.4843781590461731,
      "learning_rate": 0.00015491072062180503,
      "loss": 2.496,
      "step": 353
    },
    {
      "epoch": 0.32222095801570144,
      "grad_norm": 0.4524553716182709,
      "learning_rate": 0.00015466939126135856,
      "loss": 2.448,
      "step": 354
    },
    {
      "epoch": 0.32313118671066104,
      "grad_norm": 0.43678200244903564,
      "learning_rate": 0.00015442760692550443,
      "loss": 2.2687,
      "step": 355
    },
    {
      "epoch": 0.32404141540562065,
      "grad_norm": 0.4301970303058624,
      "learning_rate": 0.00015418536962644592,
      "loss": 2.4826,
      "step": 356
    },
    {
      "epoch": 0.32495164410058025,
      "grad_norm": 0.42540326714515686,
      "learning_rate": 0.00015394268138015598,
      "loss": 2.4205,
      "step": 357
    },
    {
      "epoch": 0.32586187279553985,
      "grad_norm": 0.4173906445503235,
      "learning_rate": 0.00015369954420636048,
      "loss": 2.394,
      "step": 358
    },
    {
      "epoch": 0.3267721014904995,
      "grad_norm": 0.43184736371040344,
      "learning_rate": 0.00015345596012852138,
      "loss": 2.3504,
      "step": 359
    },
    {
      "epoch": 0.3276823301854591,
      "grad_norm": 0.4002053141593933,
      "learning_rate": 0.00015321193117381996,
      "loss": 2.2951,
      "step": 360
    },
    {
      "epoch": 0.3285925588804187,
      "grad_norm": 0.39067134261131287,
      "learning_rate": 0.00015296745937313987,
      "loss": 2.2768,
      "step": 361
    },
    {
      "epoch": 0.3295027875753783,
      "grad_norm": 0.40051525831222534,
      "learning_rate": 0.00015272254676105025,
      "loss": 2.2235,
      "step": 362
    },
    {
      "epoch": 0.3304130162703379,
      "grad_norm": 0.3954068422317505,
      "learning_rate": 0.00015247719537578883,
      "loss": 2.2502,
      "step": 363
    },
    {
      "epoch": 0.3313232449652975,
      "grad_norm": 0.4123362600803375,
      "learning_rate": 0.00015223140725924495,
      "loss": 2.3309,
      "step": 364
    },
    {
      "epoch": 0.33223347366025713,
      "grad_norm": 0.4138774871826172,
"learning_rate": 0.00015198518445694255, |
|
"loss": 2.4107, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.33314370235521673, |
|
"grad_norm": 0.3983847498893738, |
|
"learning_rate": 0.0001517385290180231, |
|
"loss": 2.2718, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.33405393105017633, |
|
"grad_norm": 0.36962834000587463, |
|
"learning_rate": 0.00015149144299522873, |
|
"loss": 2.1744, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.334964159745136, |
|
"grad_norm": 0.37924104928970337, |
|
"learning_rate": 0.0001512439284448849, |
|
"loss": 2.1451, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.3358743884400956, |
|
"grad_norm": 0.39990487694740295, |
|
"learning_rate": 0.0001509959874268835, |
|
"loss": 2.2508, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.3367846171350552, |
|
"grad_norm": 0.3862214684486389, |
|
"learning_rate": 0.00015074762200466556, |
|
"loss": 2.1483, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3376948458300148, |
|
"grad_norm": 0.4037676751613617, |
|
"learning_rate": 0.00015049883424520414, |
|
"loss": 2.2179, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.3386050745249744, |
|
"grad_norm": 0.40439948439598083, |
|
"learning_rate": 0.00015024962621898715, |
|
"loss": 2.2054, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.339515303219934, |
|
"grad_norm": 0.3871942460536957, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 2.129, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 0.4091387093067169, |
|
"learning_rate": 0.00014974995766570855, |
|
"loss": 2.1395, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.3413357606098532, |
|
"grad_norm": 0.4097527265548706, |
|
"learning_rate": 0.00014949950129704162, |
|
"loss": 2.1789, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.3422459893048128, |
|
"grad_norm": 0.4139934480190277, |
|
"learning_rate": 0.00014924863297837378, |
|
"loss": 2.0611, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.3431562179997724, |
|
"grad_norm": 0.4146927297115326, |
|
"learning_rate": 0.00014899735479750794, |
|
"loss": 2.2488, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.3440664466947321, |
|
"grad_norm": 0.4194958209991455, |
|
"learning_rate": 0.00014874566884565807, |
|
"loss": 2.0164, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.3449766753896917, |
|
"grad_norm": 0.41280898451805115, |
|
"learning_rate": 0.00014849357721743168, |
|
"loss": 2.1503, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.3458869040846513, |
|
"grad_norm": 0.4133208692073822, |
|
"learning_rate": 0.00014824108201081247, |
|
"loss": 2.0895, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3467971327796109, |
|
"grad_norm": 0.41347819566726685, |
|
"learning_rate": 0.00014798818532714279, |
|
"loss": 2.0479, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.3477073614745705, |
|
"grad_norm": 0.43102580308914185, |
|
"learning_rate": 0.00014773488927110633, |
|
"loss": 2.1458, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.3486175901695301, |
|
"grad_norm": 0.41427451372146606, |
|
"learning_rate": 0.00014748119595071034, |
|
"loss": 1.9396, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.3495278188644897, |
|
"grad_norm": 0.46386152505874634, |
|
"learning_rate": 0.0001472271074772683, |
|
"loss": 2.2446, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.3504380475594493, |
|
"grad_norm": 0.4310764670372009, |
|
"learning_rate": 0.00014697262596538227, |
|
"loss": 2.2144, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3513482762544089, |
|
"grad_norm": 0.4956878423690796, |
|
"learning_rate": 0.00014671775353292525, |
|
"loss": 2.1875, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.35225850494936856, |
|
"grad_norm": 0.4793931543827057, |
|
"learning_rate": 0.00014646249230102366, |
|
"loss": 2.2733, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.35316873364432816, |
|
"grad_norm": 0.46217313408851624, |
|
"learning_rate": 0.00014620684439403962, |
|
"loss": 2.2812, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.35407896233928776, |
|
"grad_norm": 0.4721885323524475, |
|
"learning_rate": 0.00014595081193955324, |
|
"loss": 2.1223, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.35498919103424736, |
|
"grad_norm": 0.49550965428352356, |
|
"learning_rate": 0.000145694397068345, |
|
"loss": 2.156, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.35589941972920697, |
|
"grad_norm": 0.5109139084815979, |
|
"learning_rate": 0.0001454376019143779, |
|
"loss": 2.1494, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.35680964842416657, |
|
"grad_norm": 0.4725574553012848, |
|
"learning_rate": 0.00014518042861477986, |
|
"loss": 2.1793, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.3577198771191262, |
|
"grad_norm": 0.4739914536476135, |
|
"learning_rate": 0.00014492287930982576, |
|
"loss": 2.1763, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.3586301058140858, |
|
"grad_norm": 0.5420114994049072, |
|
"learning_rate": 0.00014466495614291977, |
|
"loss": 2.4521, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.3595403345090454, |
|
"grad_norm": 0.5225427150726318, |
|
"learning_rate": 0.00014440666126057744, |
|
"loss": 2.372, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.360450563204005, |
|
"grad_norm": 0.5337964296340942, |
|
"learning_rate": 0.0001441479968124078, |
|
"loss": 2.397, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.36136079189896464, |
|
"grad_norm": 0.5906230807304382, |
|
"learning_rate": 0.0001438889649510956, |
|
"loss": 2.506, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.36227102059392424, |
|
"grad_norm": 0.6578875780105591, |
|
"learning_rate": 0.00014362956783238324, |
|
"loss": 2.6408, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.36318124928888385, |
|
"grad_norm": 0.7982918620109558, |
|
"learning_rate": 0.00014336980761505297, |
|
"loss": 2.6612, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.36409147798384345, |
|
"grad_norm": 1.4390262365341187, |
|
"learning_rate": 0.00014310968646090883, |
|
"loss": 2.7073, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.36500170667880305, |
|
"grad_norm": 0.5260487198829651, |
|
"learning_rate": 0.00014284920653475866, |
|
"loss": 2.6269, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.36591193537376265, |
|
"grad_norm": 0.4492892026901245, |
|
"learning_rate": 0.00014258837000439618, |
|
"loss": 2.3863, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.36682216406872226, |
|
"grad_norm": 0.4619944095611572, |
|
"learning_rate": 0.0001423271790405828, |
|
"loss": 2.4595, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.36773239276368186, |
|
"grad_norm": 0.4437786638736725, |
|
"learning_rate": 0.00014206563581702964, |
|
"loss": 2.3674, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.36864262145864146, |
|
"grad_norm": 0.4789164364337921, |
|
"learning_rate": 0.0001418037425103795, |
|
"loss": 2.5203, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.36955285015360106, |
|
"grad_norm": 0.44783228635787964, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 2.5183, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.3704630788485607, |
|
"grad_norm": 0.40067169070243835, |
|
"learning_rate": 0.00014127891436890868, |
|
"loss": 2.3846, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.3713733075435203, |
|
"grad_norm": 0.3978015184402466, |
|
"learning_rate": 0.0001410159839018684, |
|
"loss": 2.3146, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.37228353623847993, |
|
"grad_norm": 0.4096076190471649, |
|
"learning_rate": 0.0001407527120872557, |
|
"loss": 2.3617, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.37319376493343953, |
|
"grad_norm": 0.4160764217376709, |
|
"learning_rate": 0.00014048910111609915, |
|
"loss": 2.2909, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.37410399362839913, |
|
"grad_norm": 0.3976461887359619, |
|
"learning_rate": 0.0001402251531822499, |
|
"loss": 2.3111, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.37501422232335874, |
|
"grad_norm": 0.3890199065208435, |
|
"learning_rate": 0.00013996087048236358, |
|
"loss": 2.0969, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.37592445101831834, |
|
"grad_norm": 0.4157082140445709, |
|
"learning_rate": 0.00013969625521588158, |
|
"loss": 2.3205, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.37683467971327794, |
|
"grad_norm": 0.4103608727455139, |
|
"learning_rate": 0.00013943130958501317, |
|
"loss": 2.2622, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.37774490840823755, |
|
"grad_norm": 0.40916207432746887, |
|
"learning_rate": 0.00013916603579471705, |
|
"loss": 2.3585, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.3786551371031972, |
|
"grad_norm": 0.39642858505249023, |
|
"learning_rate": 0.00013890043605268283, |
|
"loss": 2.2196, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.3795653657981568, |
|
"grad_norm": 0.3851282596588135, |
|
"learning_rate": 0.00013863451256931287, |
|
"loss": 2.0298, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.3804755944931164, |
|
"grad_norm": 0.38890305161476135, |
|
"learning_rate": 0.00013836826755770384, |
|
"loss": 2.1601, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.381385823188076, |
|
"grad_norm": 0.41382652521133423, |
|
"learning_rate": 0.00013810170323362816, |
|
"loss": 2.2656, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.3822960518830356, |
|
"grad_norm": 0.3820722699165344, |
|
"learning_rate": 0.0001378348218155158, |
|
"loss": 2.0094, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.3832062805779952, |
|
"grad_norm": 0.4150048494338989, |
|
"learning_rate": 0.00013756762552443553, |
|
"loss": 2.2529, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.3841165092729548, |
|
"grad_norm": 0.452776700258255, |
|
"learning_rate": 0.00013730011658407676, |
|
"loss": 2.1972, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.3850267379679144, |
|
"grad_norm": 0.4173040986061096, |
|
"learning_rate": 0.00013703229722073065, |
|
"loss": 2.1502, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.385936966662874, |
|
"grad_norm": 0.4115488529205322, |
|
"learning_rate": 0.000136764169663272, |
|
"loss": 1.9828, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.38684719535783363, |
|
"grad_norm": 0.4060666561126709, |
|
"learning_rate": 0.00013649573614314044, |
|
"loss": 2.267, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3877574240527933, |
|
"grad_norm": 0.4049409031867981, |
|
"learning_rate": 0.00013622699889432184, |
|
"loss": 2.2044, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.3886676527477529, |
|
"grad_norm": 0.40970832109451294, |
|
"learning_rate": 0.00013595796015332984, |
|
"loss": 2.0984, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.3895778814427125, |
|
"grad_norm": 0.4141111671924591, |
|
"learning_rate": 0.00013568862215918717, |
|
"loss": 2.109, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.3904881101376721, |
|
"grad_norm": 0.43404263257980347, |
|
"learning_rate": 0.00013541898715340716, |
|
"loss": 2.1763, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.3913983388326317, |
|
"grad_norm": 0.41949963569641113, |
|
"learning_rate": 0.00013514905737997473, |
|
"loss": 2.3086, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3923085675275913, |
|
"grad_norm": 0.41665390133857727, |
|
"learning_rate": 0.00013487883508532815, |
|
"loss": 2.0726, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.3932187962225509, |
|
"grad_norm": 0.4305708110332489, |
|
"learning_rate": 0.00013460832251834011, |
|
"loss": 2.1975, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.3941290249175105, |
|
"grad_norm": 0.44775405526161194, |
|
"learning_rate": 0.00013433752193029886, |
|
"loss": 2.1503, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.3950392536124701, |
|
"grad_norm": 0.44451820850372314, |
|
"learning_rate": 0.0001340664355748899, |
|
"loss": 2.1004, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.39594948230742977, |
|
"grad_norm": 0.44242945313453674, |
|
"learning_rate": 0.0001337950657081768, |
|
"loss": 2.1074, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.39685971100238937, |
|
"grad_norm": 0.4649699926376343, |
|
"learning_rate": 0.00013352341458858265, |
|
"loss": 2.2468, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.397769939697349, |
|
"grad_norm": 0.4718558192253113, |
|
"learning_rate": 0.00013325148447687125, |
|
"loss": 2.225, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3986801683923086, |
|
"grad_norm": 0.44748789072036743, |
|
"learning_rate": 0.0001329792776361282, |
|
"loss": 2.0243, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.3995903970872682, |
|
"grad_norm": 0.4730619192123413, |
|
"learning_rate": 0.00013270679633174218, |
|
"loss": 2.0262, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.4005006257822278, |
|
"grad_norm": 0.4742071032524109, |
|
"learning_rate": 0.00013243404283138597, |
|
"loss": 2.1171, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4014108544771874, |
|
"grad_norm": 0.4963454306125641, |
|
"learning_rate": 0.00013216101940499768, |
|
"loss": 2.051, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.402321083172147, |
|
"grad_norm": 0.5127780437469482, |
|
"learning_rate": 0.00013188772832476188, |
|
"loss": 2.1664, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.4032313118671066, |
|
"grad_norm": 0.5129209756851196, |
|
"learning_rate": 0.00013161417186509052, |
|
"loss": 2.2272, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.4041415405620662, |
|
"grad_norm": 0.5068848133087158, |
|
"learning_rate": 0.00013134035230260427, |
|
"loss": 2.1007, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.40505176925702585, |
|
"grad_norm": 0.5721228718757629, |
|
"learning_rate": 0.00013106627191611332, |
|
"loss": 2.255, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.40596199795198545, |
|
"grad_norm": 0.6085918545722961, |
|
"learning_rate": 0.0001307919329865985, |
|
"loss": 2.456, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.40687222664694506, |
|
"grad_norm": 0.6652196645736694, |
|
"learning_rate": 0.00013051733779719234, |
|
"loss": 2.5504, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.40778245534190466, |
|
"grad_norm": 0.7234418392181396, |
|
"learning_rate": 0.00013024248863316012, |
|
"loss": 2.5796, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.40869268403686426, |
|
"grad_norm": 0.8588744401931763, |
|
"learning_rate": 0.00012996738778188067, |
|
"loss": 2.5756, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.40960291273182386, |
|
"grad_norm": 1.2627683877944946, |
|
"learning_rate": 0.0001296920375328275, |
|
"loss": 2.203, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.41051314142678347, |
|
"grad_norm": 0.4838164746761322, |
|
"learning_rate": 0.00012941644017754964, |
|
"loss": 2.434, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.41142337012174307, |
|
"grad_norm": 0.44005534052848816, |
|
"learning_rate": 0.00012914059800965268, |
|
"loss": 2.55, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.4123335988167027, |
|
"grad_norm": 0.4343414604663849, |
|
"learning_rate": 0.0001288645133247795, |
|
"loss": 2.432, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.41324382751166233, |
|
"grad_norm": 0.4588654339313507, |
|
"learning_rate": 0.00012858818842059145, |
|
"loss": 2.4434, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.41415405620662193, |
|
"grad_norm": 0.4294244647026062, |
|
"learning_rate": 0.00012831162559674887, |
|
"loss": 2.4241, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.41506428490158154, |
|
"grad_norm": 0.40034809708595276, |
|
"learning_rate": 0.0001280348271548923, |
|
"loss": 2.3191, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.41597451359654114, |
|
"grad_norm": 0.40817153453826904, |
|
"learning_rate": 0.00012775779539862304, |
|
"loss": 2.589, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.41688474229150074, |
|
"grad_norm": 0.40605810284614563, |
|
"learning_rate": 0.0001274805326334842, |
|
"loss": 2.3445, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.41779497098646035, |
|
"grad_norm": 0.4386533200740814, |
|
"learning_rate": 0.00012720304116694138, |
|
"loss": 2.4002, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.41870519968141995, |
|
"grad_norm": 0.40985172986984253, |
|
"learning_rate": 0.00012692532330836346, |
|
"loss": 2.3964, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.41961542837637955, |
|
"grad_norm": 0.4220562279224396, |
|
"learning_rate": 0.00012664738136900348, |
|
"loss": 2.3145, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.42052565707133915, |
|
"grad_norm": 0.4068267047405243, |
|
"learning_rate": 0.00012636921766197943, |
|
"loss": 2.3274, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.42143588576629876, |
|
"grad_norm": 0.3973187208175659, |
|
"learning_rate": 0.0001260908345022547, |
|
"loss": 2.1801, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.4223461144612584, |
|
"grad_norm": 0.432224303483963, |
|
"learning_rate": 0.00012581223420661913, |
|
"loss": 2.4079, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.423256343156218, |
|
"grad_norm": 0.3939046859741211, |
|
"learning_rate": 0.00012553341909366978, |
|
"loss": 2.0749, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.4241665718511776, |
|
"grad_norm": 0.36949658393859863, |
|
"learning_rate": 0.00012525439148379128, |
|
"loss": 2.1471, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.4250768005461372, |
|
"grad_norm": 0.3828236758708954, |
|
"learning_rate": 0.00012497515369913685, |
|
"loss": 2.0466, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.4259870292410968, |
|
"grad_norm": 0.3874993920326233, |
|
"learning_rate": 0.00012469570806360875, |
|
"loss": 2.1605, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.42689725793605643, |
|
"grad_norm": 0.3854924738407135, |
|
"learning_rate": 0.00012441605690283915, |
|
"loss": 2.0584, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.42780748663101603, |
|
"grad_norm": 0.40301740169525146, |
|
"learning_rate": 0.00012413620254417057, |
|
"loss": 2.1481, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.42871771532597563, |
|
"grad_norm": 0.3891369104385376, |
|
"learning_rate": 0.00012385614731663666, |
|
"loss": 2.1968, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.42962794402093524, |
|
"grad_norm": 0.4305795729160309, |
|
"learning_rate": 0.00012357589355094275, |
|
"loss": 2.0421, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.4305381727158949, |
|
"grad_norm": 0.44661635160446167, |
|
"learning_rate": 0.0001232954435794464, |
|
"loss": 2.3347, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.4314484014108545, |
|
"grad_norm": 0.3984116315841675, |
|
"learning_rate": 0.00012301479973613822, |
|
"loss": 2.1093, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.4323586301058141, |
|
"grad_norm": 0.4153747856616974, |
|
"learning_rate": 0.00012273396435662212, |
|
"loss": 2.0698, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.4332688588007737, |
|
"grad_norm": 0.4589189887046814, |
|
"learning_rate": 0.00012245293977809605, |
|
"loss": 2.1707, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.4341790874957333, |
|
"grad_norm": 0.43936577439308167, |
|
"learning_rate": 0.0001221717283393326, |
|
"loss": 2.2608, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.4350893161906929, |
|
"grad_norm": 0.4170132279396057, |
|
"learning_rate": 0.0001218903323806595, |
|
"loss": 2.0813, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.4359995448856525, |
|
"grad_norm": 0.43124523758888245, |
|
"learning_rate": 0.00012160875424393996, |
|
"loss": 2.1674, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.4369097735806121, |
|
"grad_norm": 0.4394627511501312, |
|
"learning_rate": 0.00012132699627255347, |
|
"loss": 2.1904, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4378200022755717, |
|
"grad_norm": 0.4404590427875519, |
|
"learning_rate": 0.00012104506081137608, |
|
"loss": 2.1313, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.4387302309705313, |
|
"grad_norm": 0.4580220878124237, |
|
"learning_rate": 0.00012076295020676103, |
|
"loss": 2.16, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.439640459665491, |
|
"grad_norm": 0.4533630311489105, |
|
"learning_rate": 0.00012048066680651908, |
|
"loss": 2.1153, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.4405506883604506, |
|
"grad_norm": 0.47520536184310913, |
|
"learning_rate": 0.00012019821295989912, |
|
"loss": 2.2152, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.4414609170554102, |
|
"grad_norm": 0.44196072220802307, |
|
"learning_rate": 0.00011991559101756852, |
|
"loss": 2.1375, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.4423711457503698, |
|
"grad_norm": 0.43681493401527405, |
|
"learning_rate": 0.00011963280333159358, |
|
"loss": 2.0552, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.4432813744453294, |
|
"grad_norm": 0.4537602961063385, |
|
"learning_rate": 0.00011934985225541998, |
|
"loss": 2.1473, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.444191603140289, |
|
"grad_norm": 0.4935773015022278, |
|
"learning_rate": 0.00011906674014385318, |
|
"loss": 2.0623, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.4451018318352486, |
|
"grad_norm": 0.4802737236022949, |
|
"learning_rate": 0.00011878346935303883, |
|
"loss": 2.2908, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.4460120605302082, |
|
"grad_norm": 0.5020537376403809, |
|
"learning_rate": 0.00011850004224044315, |
|
"loss": 2.3101, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4469222892251678, |
|
"grad_norm": 0.5106056332588196, |
|
"learning_rate": 0.00011821646116483335, |
|
"loss": 2.2838, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.44783251792012746, |
|
"grad_norm": 0.473910391330719, |
|
"learning_rate": 0.00011793272848625797, |
|
"loss": 2.0599, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.44874274661508706, |
|
"grad_norm": 0.5086584091186523, |
|
"learning_rate": 0.0001176488465660271, |
|
"loss": 2.1578, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.44965297531004667, |
|
"grad_norm": 0.5282394886016846, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 2.2965, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.45056320400500627, |
|
"grad_norm": 0.5987780094146729, |
|
"learning_rate": 0.00011708064445203042, |
|
"loss": 2.3542, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.45147343269996587, |
|
"grad_norm": 0.5943189859390259, |
|
"learning_rate": 0.00011679632898701649, |
|
"loss": 2.4294, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.4523836613949255, |
|
"grad_norm": 0.6443737149238586, |
|
"learning_rate": 0.0001165118737378116, |
|
"loss": 2.605, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.4532938900898851, |
|
"grad_norm": 0.7082577347755432, |
|
"learning_rate": 0.00011622728107173946, |
|
"loss": 2.4254, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.4542041187848447, |
|
"grad_norm": 0.8503845930099487, |
|
"learning_rate": 0.00011594255335726724, |
|
"loss": 2.5187, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.4551143474798043, |
|
"grad_norm": 1.6775977611541748, |
|
"learning_rate": 0.00011565769296398618, |
|
"loss": 2.6669, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4560245761747639, |
|
"grad_norm": 0.45572495460510254, |
|
"learning_rate": 0.00011537270226259169, |
|
"loss": 2.5806, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.45693480486972354, |
|
"grad_norm": 0.45138293504714966, |
|
"learning_rate": 0.00011508758362486358, |
|
"loss": 2.3935, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.45784503356468315, |
|
"grad_norm": 0.4548013210296631, |
|
"learning_rate": 0.00011480233942364645, |
|
"loss": 2.321, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.45875526225964275, |
|
"grad_norm": 0.434442400932312, |
|
"learning_rate": 0.00011451697203282982, |
|
"loss": 2.375, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.45966549095460235, |
|
"grad_norm": 0.4139295816421509, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 2.3997, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.46057571964956195, |
|
"grad_norm": 0.46020230650901794, |
|
"learning_rate": 0.00011394587718306275, |
|
"loss": 2.5745, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.46148594834452156, |
|
"grad_norm": 0.4194343090057373, |
|
"learning_rate": 0.00011366015447693837, |
|
"loss": 2.2597, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.46239617703948116, |
|
"grad_norm": 0.43983832001686096, |
|
"learning_rate": 0.0001133743180868273, |
|
"loss": 2.3511, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.46330640573444076, |
|
"grad_norm": 0.41047292947769165, |
|
"learning_rate": 0.00011308837039154739, |
|
"loss": 2.2614, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.46421663442940037, |
|
"grad_norm": 0.4110110104084015, |
|
"learning_rate": 0.0001128023137708429, |
|
"loss": 2.2719, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.46512686312435997, |
|
"grad_norm": 0.41848358511924744, |
|
"learning_rate": 0.0001125161506053646, |
|
"loss": 2.3872, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.4660370918193196, |
|
"grad_norm": 0.39852631092071533, |
|
"learning_rate": 0.00011222988327664997, |
|
"loss": 2.2001, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.46694732051427923, |
|
"grad_norm": 0.4060978293418884, |
|
"learning_rate": 0.00011194351416710324, |
|
"loss": 2.2474, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.46785754920923883, |
|
"grad_norm": 0.4010358452796936, |
|
"learning_rate": 0.00011165704565997593, |
|
"loss": 2.1262, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.46876777790419843, |
|
"grad_norm": 0.4063378572463989, |
|
"learning_rate": 0.00011137048013934656, |
|
"loss": 2.1583, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.46967800659915804, |
|
"grad_norm": 0.40287846326828003, |
|
"learning_rate": 0.00011108381999010111, |
|
"loss": 2.2351, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.3861018717288971, |
|
"learning_rate": 0.00011079706759791311, |
|
"loss": 2.195, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.47149846398907724, |
|
"grad_norm": 0.38855546712875366, |
|
"learning_rate": 0.00011051022534922371, |
|
"loss": 2.1575, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.47240869268403685, |
|
"grad_norm": 0.3941628038883209, |
|
"learning_rate": 0.00011022329563122191, |
|
"loss": 2.2324, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.47331892137899645, |
|
"grad_norm": 0.40604814887046814, |
|
"learning_rate": 0.00010993628083182467, |
|
"loss": 2.1641, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4742291500739561, |
|
"grad_norm": 0.407815158367157, |
|
"learning_rate": 0.000109649183339657, |
|
"loss": 2.1648, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.4751393787689157, |
|
"grad_norm": 0.400680810213089, |
|
"learning_rate": 0.00010936200554403209, |
|
"loss": 2.1939, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.4760496074638753, |
|
"grad_norm": 0.416537344455719, |
|
"learning_rate": 0.00010907474983493144, |
|
"loss": 2.1694, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.4769598361588349, |
|
"grad_norm": 0.4097869396209717, |
|
"learning_rate": 0.00010878741860298503, |
|
"loss": 2.1785, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.4778700648537945, |
|
"grad_norm": 0.4243004024028778, |
|
"learning_rate": 0.00010850001423945126, |
|
"loss": 1.9963, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.4787802935487541, |
|
"grad_norm": 0.41958731412887573, |
|
"learning_rate": 0.00010821253913619726, |
|
"loss": 2.1629, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.4796905222437137, |
|
"grad_norm": 0.4177284240722656, |
|
"learning_rate": 0.00010792499568567884, |
|
"loss": 2.1276, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.4806007509386733, |
|
"grad_norm": 0.41077664494514465, |
|
"learning_rate": 0.00010763738628092062, |
|
"loss": 2.0852, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.48151097963363293, |
|
"grad_norm": 0.4098223149776459, |
|
"learning_rate": 0.00010734971331549603, |
|
"loss": 1.9977, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.48242120832859253, |
|
"grad_norm": 0.42255935072898865, |
|
"learning_rate": 0.00010706197918350758, |
|
"loss": 1.9822, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.4833314370235522, |
|
"grad_norm": 0.45597127079963684, |
|
"learning_rate": 0.0001067741862795668, |
|
"loss": 2.1072, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.4842416657185118, |
|
"grad_norm": 0.4538208544254303, |
|
"learning_rate": 0.0001064863369987743, |
|
"loss": 2.41, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.4851518944134714, |
|
"grad_norm": 0.4586673676967621, |
|
"learning_rate": 0.00010619843373669993, |
|
"loss": 2.1736, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.486062123108431, |
|
"grad_norm": 0.4433608055114746, |
|
"learning_rate": 0.00010591047888936274, |
|
"loss": 2.1324, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.4869723518033906, |
|
"grad_norm": 0.4421234428882599, |
|
"learning_rate": 0.00010562247485321115, |
|
"loss": 2.0689, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.4878825804983502, |
|
"grad_norm": 0.46843069791793823, |
|
"learning_rate": 0.00010533442402510284, |
|
"loss": 2.2252, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.4887928091933098, |
|
"grad_norm": 0.4747142493724823, |
|
"learning_rate": 0.00010504632880228498, |
|
"loss": 2.2503, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.4897030378882694, |
|
"grad_norm": 0.46643224358558655, |
|
"learning_rate": 0.00010475819158237425, |
|
"loss": 2.2628, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.490613266583229, |
|
"grad_norm": 0.47085490822792053, |
|
"learning_rate": 0.00010447001476333673, |
|
"loss": 2.0888, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.49152349527818867, |
|
"grad_norm": 0.5102598071098328, |
|
"learning_rate": 0.00010418180074346815, |
|
"loss": 2.2736, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.4924337239731483, |
|
"grad_norm": 0.49878573417663574, |
|
"learning_rate": 0.00010389355192137377, |
|
"loss": 2.1107, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.4933439526681079, |
|
"grad_norm": 0.5236616134643555, |
|
"learning_rate": 0.00010360527069594859, |
|
"loss": 2.4099, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.4942541813630675, |
|
"grad_norm": 0.49875032901763916, |
|
"learning_rate": 0.00010331695946635708, |
|
"loss": 2.1381, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.4951644100580271, |
|
"grad_norm": 0.5333012938499451, |
|
"learning_rate": 0.00010302862063201367, |
|
"loss": 2.2274, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.4960746387529867, |
|
"grad_norm": 0.5504993200302124, |
|
"learning_rate": 0.00010274025659256232, |
|
"loss": 2.2348, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.4969848674479463, |
|
"grad_norm": 0.5924202799797058, |
|
"learning_rate": 0.00010245186974785685, |
|
"loss": 2.3686, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.4978950961429059, |
|
"grad_norm": 0.6003567576408386, |
|
"learning_rate": 0.00010216346249794087, |
|
"loss": 2.3336, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.4988053248378655, |
|
"grad_norm": 0.6700019836425781, |
|
"learning_rate": 0.00010187503724302776, |
|
"loss": 2.4446, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.4997155535328251, |
|
"grad_norm": 0.8171781897544861, |
|
"learning_rate": 0.00010158659638348081, |
|
"loss": 2.4278, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.5006257822277848, |
|
"grad_norm": 1.4212020635604858, |
|
"learning_rate": 0.0001012981423197931, |
|
"loss": 2.6229, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5006257822277848, |
|
"eval_loss": 2.2479705810546875, |
|
"eval_runtime": 205.3622, |
|
"eval_samples_per_second": 9.013, |
|
"eval_steps_per_second": 4.509, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5015360109227444, |
|
"grad_norm": 0.4380887746810913, |
|
"learning_rate": 0.00010100967745256766, |
|
"loss": 2.4596, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.502446239617704, |
|
"grad_norm": 0.4522015154361725, |
|
"learning_rate": 0.00010072120418249745, |
|
"loss": 2.3217, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.5033564683126636, |
|
"grad_norm": 0.4581010043621063, |
|
"learning_rate": 0.00010043272491034523, |
|
"loss": 2.4948, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.5042666970076232, |
|
"grad_norm": 0.4353744685649872, |
|
"learning_rate": 0.00010014424203692388, |
|
"loss": 2.3769, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.5051769257025828, |
|
"grad_norm": 0.4150161147117615, |
|
"learning_rate": 9.985575796307615e-05, |
|
"loss": 2.3557, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.5060871543975424, |
|
"grad_norm": 0.4532707631587982, |
|
"learning_rate": 9.956727508965481e-05, |
|
"loss": 2.3114, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.506997383092502, |
|
"grad_norm": 0.42450255155563354, |
|
"learning_rate": 9.927879581750259e-05, |
|
"loss": 2.2911, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.5079076117874616, |
|
"grad_norm": 0.42910221219062805, |
|
"learning_rate": 9.899032254743235e-05, |
|
"loss": 2.3062, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.5088178404824212, |
|
"grad_norm": 0.42122408747673035, |
|
"learning_rate": 9.870185768020693e-05, |
|
"loss": 2.3294, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.5097280691773808, |
|
"grad_norm": 0.4562203884124756, |
|
"learning_rate": 9.84134036165192e-05, |
|
"loss": 2.2674, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 0.3905525207519531, |
|
"learning_rate": 9.812496275697226e-05, |
|
"loss": 2.1259, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.5115485265673, |
|
"grad_norm": 0.41641199588775635, |
|
"learning_rate": 9.783653750205915e-05, |
|
"loss": 2.2191, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.5124587552622596, |
|
"grad_norm": 0.4090450704097748, |
|
"learning_rate": 9.754813025214317e-05, |
|
"loss": 2.2478, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.5133689839572193, |
|
"grad_norm": 0.4293293356895447, |
|
"learning_rate": 9.725974340743769e-05, |
|
"loss": 2.3854, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.5142792126521789, |
|
"grad_norm": 0.4138126075267792, |
|
"learning_rate": 9.697137936798634e-05, |
|
"loss": 2.2903, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.5151894413471385, |
|
"grad_norm": 0.3979492783546448, |
|
"learning_rate": 9.668304053364294e-05, |
|
"loss": 2.1878, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.5160996700420981, |
|
"grad_norm": 0.38530561327934265, |
|
"learning_rate": 9.639472930405143e-05, |
|
"loss": 2.1464, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.5170098987370577, |
|
"grad_norm": 0.4263143837451935, |
|
"learning_rate": 9.610644807862625e-05, |
|
"loss": 2.1856, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.5179201274320173, |
|
"grad_norm": 0.41127926111221313, |
|
"learning_rate": 9.581819925653188e-05, |
|
"loss": 2.198, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.5188303561269769, |
|
"grad_norm": 0.3917299509048462, |
|
"learning_rate": 9.552998523666326e-05, |
|
"loss": 2.1325, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5197405848219365, |
|
"grad_norm": 0.39180508255958557, |
|
"learning_rate": 9.524180841762577e-05, |
|
"loss": 2.0702, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.5206508135168961, |
|
"grad_norm": 0.3829837739467621, |
|
"learning_rate": 9.495367119771503e-05, |
|
"loss": 1.8913, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.5215610422118557, |
|
"grad_norm": 0.412706196308136, |
|
"learning_rate": 9.46655759748972e-05, |
|
"loss": 2.1221, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.5224712709068153, |
|
"grad_norm": 0.39748141169548035, |
|
"learning_rate": 9.437752514678887e-05, |
|
"loss": 2.0427, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.5233814996017749, |
|
"grad_norm": 0.42854538559913635, |
|
"learning_rate": 9.408952111063727e-05, |
|
"loss": 2.121, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.5242917282967345, |
|
"grad_norm": 0.414654016494751, |
|
"learning_rate": 9.380156626330009e-05, |
|
"loss": 2.038, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.5252019569916941, |
|
"grad_norm": 0.4241427183151245, |
|
"learning_rate": 9.35136630012257e-05, |
|
"loss": 2.1312, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.5261121856866537, |
|
"grad_norm": 0.42928779125213623, |
|
"learning_rate": 9.322581372043321e-05, |
|
"loss": 2.1875, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.5270224143816133, |
|
"grad_norm": 0.4133308231830597, |
|
"learning_rate": 9.293802081649243e-05, |
|
"loss": 2.0477, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.527932643076573, |
|
"grad_norm": 0.427898645401001, |
|
"learning_rate": 9.265028668450402e-05, |
|
"loss": 2.0833, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5288428717715326, |
|
"grad_norm": 0.4321751892566681, |
|
"learning_rate": 9.23626137190794e-05, |
|
"loss": 2.1698, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.5297531004664922, |
|
"grad_norm": 0.4715782105922699, |
|
"learning_rate": 9.207500431432115e-05, |
|
"loss": 2.1347, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.5306633291614519, |
|
"grad_norm": 0.45599547028541565, |
|
"learning_rate": 9.178746086380275e-05, |
|
"loss": 2.1469, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.5315735578564115, |
|
"grad_norm": 0.45286545157432556, |
|
"learning_rate": 9.149998576054874e-05, |
|
"loss": 2.2013, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.5324837865513711, |
|
"grad_norm": 0.47089457511901855, |
|
"learning_rate": 9.121258139701502e-05, |
|
"loss": 2.2125, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.5333940152463307, |
|
"grad_norm": 0.46750229597091675, |
|
"learning_rate": 9.092525016506858e-05, |
|
"loss": 2.1186, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.5343042439412903, |
|
"grad_norm": 0.4931905269622803, |
|
"learning_rate": 9.063799445596795e-05, |
|
"loss": 2.2185, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.5352144726362499, |
|
"grad_norm": 0.48538026213645935, |
|
"learning_rate": 9.035081666034304e-05, |
|
"loss": 2.2369, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.5361247013312095, |
|
"grad_norm": 0.4944066107273102, |
|
"learning_rate": 9.006371916817534e-05, |
|
"loss": 2.2771, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.5370349300261691, |
|
"grad_norm": 0.4564894139766693, |
|
"learning_rate": 8.977670436877811e-05, |
|
"loss": 2.0879, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5379451587211287, |
|
"grad_norm": 0.5046347379684448, |
|
"learning_rate": 8.948977465077632e-05, |
|
"loss": 2.2197, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.5388553874160883, |
|
"grad_norm": 0.49683472514152527, |
|
"learning_rate": 8.920293240208694e-05, |
|
"loss": 2.2152, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.5397656161110479, |
|
"grad_norm": 0.5223331451416016, |
|
"learning_rate": 8.891618000989891e-05, |
|
"loss": 2.3358, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.5406758448060075, |
|
"grad_norm": 0.5552563667297363, |
|
"learning_rate": 8.862951986065345e-05, |
|
"loss": 2.1608, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.5415860735009671, |
|
"grad_norm": 0.5853347778320312, |
|
"learning_rate": 8.83429543400241e-05, |
|
"loss": 2.3679, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.5424963021959267, |
|
"grad_norm": 0.5858141183853149, |
|
"learning_rate": 8.805648583289674e-05, |
|
"loss": 2.341, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.5434065308908863, |
|
"grad_norm": 0.6405509114265442, |
|
"learning_rate": 8.777011672335008e-05, |
|
"loss": 2.4773, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.5443167595858459, |
|
"grad_norm": 0.7342801094055176, |
|
"learning_rate": 8.748384939463543e-05, |
|
"loss": 2.557, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.5452269882808055, |
|
"grad_norm": 0.8813995122909546, |
|
"learning_rate": 8.719768622915714e-05, |
|
"loss": 2.5595, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.5461372169757651, |
|
"grad_norm": 1.722114086151123, |
|
"learning_rate": 8.691162960845264e-05, |
|
"loss": 2.7211, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5470474456707247, |
|
"grad_norm": 0.424265056848526, |
|
"learning_rate": 8.662568191317273e-05, |
|
"loss": 2.3728, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.5479576743656844, |
|
"grad_norm": 0.45933809876441956, |
|
"learning_rate": 8.633984552306164e-05, |
|
"loss": 2.4234, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.548867903060644, |
|
"grad_norm": 0.45455530285835266, |
|
"learning_rate": 8.605412281693727e-05, |
|
"loss": 2.5062, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.5497781317556036, |
|
"grad_norm": 0.4334143400192261, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 2.3087, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.5506883604505632, |
|
"grad_norm": 0.4537433385848999, |
|
"learning_rate": 8.548302796717019e-05, |
|
"loss": 2.395, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.5515985891455228, |
|
"grad_norm": 0.43673837184906006, |
|
"learning_rate": 8.519766057635355e-05, |
|
"loss": 2.3855, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.5525088178404824, |
|
"grad_norm": 0.43078145384788513, |
|
"learning_rate": 8.491241637513644e-05, |
|
"loss": 2.2559, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.553419046535442, |
|
"grad_norm": 0.4094640612602234, |
|
"learning_rate": 8.462729773740832e-05, |
|
"loss": 2.295, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.5543292752304017, |
|
"grad_norm": 0.4126126170158386, |
|
"learning_rate": 8.434230703601384e-05, |
|
"loss": 2.2019, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.5552395039253613, |
|
"grad_norm": 0.4372231066226959, |
|
"learning_rate": 8.405744664273278e-05, |
|
"loss": 2.4243, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5561497326203209, |
|
"grad_norm": 0.42160138487815857, |
|
"learning_rate": 8.37727189282606e-05, |
|
"loss": 2.2805, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.5570599613152805, |
|
"grad_norm": 0.4336857795715332, |
|
"learning_rate": 8.34881262621884e-05, |
|
"loss": 2.4811, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.5579701900102401, |
|
"grad_norm": 0.40520837903022766, |
|
"learning_rate": 8.320367101298351e-05, |
|
"loss": 2.1731, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.5588804187051997, |
|
"grad_norm": 0.42664197087287903, |
|
"learning_rate": 8.291935554796962e-05, |
|
"loss": 2.3403, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.5597906474001593, |
|
"grad_norm": 0.4109039902687073, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 2.2425, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5607008760951189, |
|
"grad_norm": 0.4032575786113739, |
|
"learning_rate": 8.235115343397295e-05, |
|
"loss": 2.2593, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.5616111047900785, |
|
"grad_norm": 0.3929396867752075, |
|
"learning_rate": 8.206727151374207e-05, |
|
"loss": 2.0896, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.5625213334850381, |
|
"grad_norm": 0.38567835092544556, |
|
"learning_rate": 8.178353883516664e-05, |
|
"loss": 2.0715, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.5634315621799977, |
|
"grad_norm": 0.405369371175766, |
|
"learning_rate": 8.149995775955686e-05, |
|
"loss": 2.2249, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.5643417908749573, |
|
"grad_norm": 0.3889697790145874, |
|
"learning_rate": 8.121653064696118e-05, |
|
"loss": 2.0797, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.565252019569917, |
|
"grad_norm": 0.4065384864807129, |
|
"learning_rate": 8.093325985614685e-05, |
|
"loss": 2.2012, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.5661622482648766, |
|
"grad_norm": 0.4066416323184967, |
|
"learning_rate": 8.065014774458003e-05, |
|
"loss": 2.1183, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.5670724769598362, |
|
"grad_norm": 0.40575870871543884, |
|
"learning_rate": 8.036719666840647e-05, |
|
"loss": 2.0258, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.5679827056547958, |
|
"grad_norm": 0.42911243438720703, |
|
"learning_rate": 8.008440898243149e-05, |
|
"loss": 2.1186, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.5688929343497554, |
|
"grad_norm": 0.4009549021720886, |
|
"learning_rate": 7.980178704010089e-05, |
|
"loss": 2.0049, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.569803163044715, |
|
"grad_norm": 0.41156989336013794, |
|
"learning_rate": 7.951933319348095e-05, |
|
"loss": 2.0272, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.5707133917396746, |
|
"grad_norm": 0.4248954653739929, |
|
"learning_rate": 7.923704979323899e-05, |
|
"loss": 2.077, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.5716236204346342, |
|
"grad_norm": 0.45484524965286255, |
|
"learning_rate": 7.895493918862396e-05, |
|
"loss": 2.2255, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.5725338491295938, |
|
"grad_norm": 0.4571921229362488, |
|
"learning_rate": 7.867300372744657e-05, |
|
"loss": 2.1373, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.5734440778245534, |
|
"grad_norm": 0.44238901138305664, |
|
"learning_rate": 7.839124575606004e-05, |
|
"loss": 2.1147, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.574354306519513, |
|
"grad_norm": 0.4206310510635376, |
|
"learning_rate": 7.810966761934053e-05, |
|
"loss": 2.0508, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.5752645352144726, |
|
"grad_norm": 0.43381330370903015, |
|
"learning_rate": 7.782827166066739e-05, |
|
"loss": 2.0847, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.5761747639094322, |
|
"grad_norm": 0.4460139572620392, |
|
"learning_rate": 7.754706022190398e-05, |
|
"loss": 2.1288, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.5770849926043918, |
|
"grad_norm": 0.4371720850467682, |
|
"learning_rate": 7.726603564337791e-05, |
|
"loss": 2.0476, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.5779952212993514, |
|
"grad_norm": 0.4623599052429199, |
|
"learning_rate": 7.69852002638618e-05, |
|
"loss": 2.2858, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.578905449994311, |
|
"grad_norm": 0.4422992765903473, |
|
"learning_rate": 7.670455642055361e-05, |
|
"loss": 2.1072, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.5798156786892706, |
|
"grad_norm": 0.4804936647415161, |
|
"learning_rate": 7.642410644905726e-05, |
|
"loss": 2.2218, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.5807259073842302, |
|
"grad_norm": 0.48249900341033936, |
|
"learning_rate": 7.614385268336336e-05, |
|
"loss": 2.2916, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.5816361360791898, |
|
"grad_norm": 0.46635982394218445, |
|
"learning_rate": 7.586379745582944e-05, |
|
"loss": 2.1636, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.5825463647741494, |
|
"grad_norm": 0.4670505225658417, |
|
"learning_rate": 7.558394309716088e-05, |
|
"loss": 2.2052, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.5834565934691092, |
|
"grad_norm": 0.49475541710853577, |
|
"learning_rate": 7.530429193639128e-05, |
|
"loss": 2.18, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.5843668221640688, |
|
"grad_norm": 0.5231596231460571, |
|
"learning_rate": 7.502484630086318e-05, |
|
"loss": 2.2095, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.5852770508590284, |
|
"grad_norm": 0.5045900344848633, |
|
"learning_rate": 7.474560851620873e-05, |
|
"loss": 2.053, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.586187279553988, |
|
"grad_norm": 0.5511046051979065, |
|
"learning_rate": 7.446658090633026e-05, |
|
"loss": 2.2706, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.5870975082489476, |
|
"grad_norm": 0.5700446963310242, |
|
"learning_rate": 7.41877657933809e-05, |
|
"loss": 2.3999, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5880077369439072, |
|
"grad_norm": 0.5792605876922607, |
|
"learning_rate": 7.390916549774536e-05, |
|
"loss": 2.2391, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.5889179656388668, |
|
"grad_norm": 0.6770455241203308, |
|
"learning_rate": 7.363078233802063e-05, |
|
"loss": 2.6564, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.5898281943338264, |
|
"grad_norm": 0.7092955708503723, |
|
"learning_rate": 7.335261863099651e-05, |
|
"loss": 2.4722, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.590738423028786, |
|
"grad_norm": 0.8125056028366089, |
|
"learning_rate": 7.307467669163655e-05, |
|
"loss": 2.3581, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.5916486517237456, |
|
"grad_norm": 1.5941253900527954, |
|
"learning_rate": 7.279695883305866e-05, |
|
"loss": 2.16, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5925588804187052, |
|
"grad_norm": 0.44398507475852966, |
|
"learning_rate": 7.251946736651582e-05, |
|
"loss": 2.4689, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.5934691091136648, |
|
"grad_norm": 0.4264509975910187, |
|
"learning_rate": 7.224220460137701e-05, |
|
"loss": 2.4081, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.5943793378086244, |
|
"grad_norm": 0.4222484529018402, |
|
"learning_rate": 7.196517284510773e-05, |
|
"loss": 2.3827, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.595289566503584, |
|
"grad_norm": 0.4516051411628723, |
|
"learning_rate": 7.168837440325114e-05, |
|
"loss": 2.399, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.5961997951985436, |
|
"grad_norm": 0.4370306730270386, |
|
"learning_rate": 7.141181157940859e-05, |
|
"loss": 2.3837, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5971100238935032, |
|
"grad_norm": 0.4236253798007965, |
|
"learning_rate": 7.11354866752205e-05, |
|
"loss": 2.3066, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.5980202525884628, |
|
"grad_norm": 0.41718846559524536, |
|
"learning_rate": 7.085940199034735e-05, |
|
"loss": 2.3841, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.5989304812834224, |
|
"grad_norm": 0.4299750030040741, |
|
"learning_rate": 7.058355982245037e-05, |
|
"loss": 2.3842, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.599840709978382, |
|
"grad_norm": 0.4180915057659149, |
|
"learning_rate": 7.030796246717255e-05, |
|
"loss": 2.0758, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.6007509386733417, |
|
"grad_norm": 0.45195114612579346, |
|
"learning_rate": 7.003261221811934e-05, |
|
"loss": 2.4826, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6016611673683013, |
|
"grad_norm": 0.4253404140472412, |
|
"learning_rate": 6.97575113668399e-05, |
|
"loss": 2.3705, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.6025713960632609, |
|
"grad_norm": 0.4198931157588959, |
|
"learning_rate": 6.948266220280771e-05, |
|
"loss": 2.3396, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.6034816247582205, |
|
"grad_norm": 0.43457460403442383, |
|
"learning_rate": 6.920806701340155e-05, |
|
"loss": 2.1447, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.6043918534531801, |
|
"grad_norm": 0.40161874890327454, |
|
"learning_rate": 6.893372808388675e-05, |
|
"loss": 2.2443, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.6053020821481397, |
|
"grad_norm": 0.4039609432220459, |
|
"learning_rate": 6.865964769739575e-05, |
|
"loss": 2.1815, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.6062123108430993, |
|
"grad_norm": 0.4061351716518402, |
|
"learning_rate": 6.838582813490947e-05, |
|
"loss": 2.1073, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.607122539538059, |
|
"grad_norm": 0.4206211268901825, |
|
"learning_rate": 6.811227167523815e-05, |
|
"loss": 2.2549, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.6080327682330186, |
|
"grad_norm": 0.3936857283115387, |
|
"learning_rate": 6.783898059500233e-05, |
|
"loss": 2.1373, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.6089429969279782, |
|
"grad_norm": 0.3954029083251953, |
|
"learning_rate": 6.756595716861407e-05, |
|
"loss": 2.1001, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.6098532256229378, |
|
"grad_norm": 0.407713919878006, |
|
"learning_rate": 6.729320366825784e-05, |
|
"loss": 2.0967, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6107634543178974, |
|
"grad_norm": 0.41096213459968567, |
|
"learning_rate": 6.702072236387182e-05, |
|
"loss": 2.0899, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.611673683012857, |
|
"grad_norm": 0.40465790033340454, |
|
"learning_rate": 6.674851552312878e-05, |
|
"loss": 2.089, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.6125839117078166, |
|
"grad_norm": 0.3991434574127197, |
|
"learning_rate": 6.647658541141735e-05, |
|
"loss": 1.9788, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.6134941404027762, |
|
"grad_norm": 0.42327383160591125, |
|
"learning_rate": 6.620493429182323e-05, |
|
"loss": 2.1672, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.6144043690977358, |
|
"grad_norm": 0.4061299264431, |
|
"learning_rate": 6.593356442511015e-05, |
|
"loss": 2.1617, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.6153145977926954, |
|
"grad_norm": 0.4082658588886261, |
|
"learning_rate": 6.566247806970119e-05, |
|
"loss": 2.0112, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.616224826487655, |
|
"grad_norm": 0.43016669154167175, |
|
"learning_rate": 6.539167748165994e-05, |
|
"loss": 2.0, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.6171350551826146, |
|
"grad_norm": 0.43142226338386536, |
|
"learning_rate": 6.512116491467185e-05, |
|
"loss": 2.1585, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.6180452838775743, |
|
"grad_norm": 0.4271491467952728, |
|
"learning_rate": 6.485094262002529e-05, |
|
"loss": 1.9628, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.6189555125725339, |
|
"grad_norm": 0.44002971053123474, |
|
"learning_rate": 6.458101284659286e-05, |
|
"loss": 2.2214, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6198657412674935, |
|
"grad_norm": 0.4215126931667328, |
|
"learning_rate": 6.431137784081282e-05, |
|
"loss": 2.0377, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.6207759699624531, |
|
"grad_norm": 0.46792343258857727, |
|
"learning_rate": 6.404203984667019e-05, |
|
"loss": 2.029, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.6216861986574127, |
|
"grad_norm": 0.45737308263778687, |
|
"learning_rate": 6.377300110567821e-05, |
|
"loss": 2.2375, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.6225964273523723, |
|
"grad_norm": 0.4526033401489258, |
|
"learning_rate": 6.350426385685957e-05, |
|
"loss": 2.2562, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.6235066560473319, |
|
"grad_norm": 0.45917776226997375, |
|
"learning_rate": 6.323583033672799e-05, |
|
"loss": 2.1321, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.6244168847422915, |
|
"grad_norm": 0.4713301658630371, |
|
"learning_rate": 6.296770277926937e-05, |
|
"loss": 2.07, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.6253271134372511, |
|
"grad_norm": 0.5036799907684326, |
|
"learning_rate": 6.269988341592328e-05, |
|
"loss": 2.1103, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.6262373421322107, |
|
"grad_norm": 0.4843004643917084, |
|
"learning_rate": 6.243237447556449e-05, |
|
"loss": 2.0936, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.6271475708271703, |
|
"grad_norm": 0.4738497734069824, |
|
"learning_rate": 6.216517818448423e-05, |
|
"loss": 2.1004, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.6280577995221299, |
|
"grad_norm": 0.5081862211227417, |
|
"learning_rate": 6.189829676637182e-05, |
|
"loss": 2.2177, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6289680282170895, |
|
"grad_norm": 0.5060831904411316, |
|
"learning_rate": 6.163173244229619e-05, |
|
"loss": 2.1342, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.6298782569120491, |
|
"grad_norm": 0.5047332644462585, |
|
"learning_rate": 6.136548743068713e-05, |
|
"loss": 2.0727, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.6307884856070087, |
|
"grad_norm": 0.5174648761749268, |
|
"learning_rate": 6.109956394731722e-05, |
|
"loss": 2.0623, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.6316987143019683, |
|
"grad_norm": 0.5705139636993408, |
|
"learning_rate": 6.083396420528298e-05, |
|
"loss": 2.4454, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.6326089429969279, |
|
"grad_norm": 0.5653088092803955, |
|
"learning_rate": 6.056869041498687e-05, |
|
"loss": 2.2071, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.6335191716918875, |
|
"grad_norm": 0.6105946898460388, |
|
"learning_rate": 6.030374478411847e-05, |
|
"loss": 2.3081, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.6344294003868471, |
|
"grad_norm": 0.6362658143043518, |
|
"learning_rate": 6.0039129517636435e-05, |
|
"loss": 2.3426, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.6353396290818069, |
|
"grad_norm": 0.7242766618728638, |
|
"learning_rate": 5.9774846817750105e-05, |
|
"loss": 2.4877, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.6362498577767665, |
|
"grad_norm": 0.9446219205856323, |
|
"learning_rate": 5.951089888390087e-05, |
|
"loss": 2.7741, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.6371600864717261, |
|
"grad_norm": 1.5826219320297241, |
|
"learning_rate": 5.924728791274432e-05, |
|
"loss": 2.533, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6380703151666857, |
|
"grad_norm": 0.4994019567966461, |
|
"learning_rate": 5.89840160981316e-05, |
|
"loss": 2.4388, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.6389805438616453, |
|
"grad_norm": 0.45213326811790466, |
|
"learning_rate": 5.872108563109131e-05, |
|
"loss": 2.3644, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.6398907725566049, |
|
"grad_norm": 0.45655617117881775, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 2.5202, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.6408010012515645, |
|
"grad_norm": 0.41640329360961914, |
|
"learning_rate": 5.819625748962049e-05, |
|
"loss": 2.3097, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.6417112299465241, |
|
"grad_norm": 0.43625307083129883, |
|
"learning_rate": 5.79343641829704e-05, |
|
"loss": 2.3931, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.6426214586414837, |
|
"grad_norm": 0.44178506731987, |
|
"learning_rate": 5.7672820959417254e-05, |
|
"loss": 2.3195, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.6435316873364433, |
|
"grad_norm": 0.4416089951992035, |
|
"learning_rate": 5.741162999560386e-05, |
|
"loss": 2.2446, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.6444419160314029, |
|
"grad_norm": 0.4419204890727997, |
|
"learning_rate": 5.7150793465241346e-05, |
|
"loss": 2.34, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.6453521447263625, |
|
"grad_norm": 0.45422717928886414, |
|
"learning_rate": 5.68903135390912e-05, |
|
"loss": 2.1915, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.6462623734213221, |
|
"grad_norm": 0.41635921597480774, |
|
"learning_rate": 5.663019238494704e-05, |
|
"loss": 2.3147, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6471726021162817, |
|
"grad_norm": 0.4240402579307556, |
|
"learning_rate": 5.637043216761678e-05, |
|
"loss": 2.1693, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.6480828308112413, |
|
"grad_norm": 0.41989627480506897, |
|
"learning_rate": 5.611103504890444e-05, |
|
"loss": 2.2087, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.6489930595062009, |
|
"grad_norm": 0.4187220335006714, |
|
"learning_rate": 5.5852003187592226e-05, |
|
"loss": 2.3818, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.6499032882011605, |
|
"grad_norm": 0.43209201097488403, |
|
"learning_rate": 5.559333873942259e-05, |
|
"loss": 2.3176, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.6508135168961201, |
|
"grad_norm": 0.416291207075119, |
|
"learning_rate": 5.533504385708024e-05, |
|
"loss": 2.2397, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.6517237455910797, |
|
"grad_norm": 0.411857932806015, |
|
"learning_rate": 5.5077120690174246e-05, |
|
"loss": 2.1142, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.6526339742860394, |
|
"grad_norm": 0.4142412543296814, |
|
"learning_rate": 5.481957138522018e-05, |
|
"loss": 2.2226, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.653544202980999, |
|
"grad_norm": 0.4322018325328827, |
|
"learning_rate": 5.456239808562209e-05, |
|
"loss": 2.2078, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.6544544316759586, |
|
"grad_norm": 0.42210081219673157, |
|
"learning_rate": 5.4305602931655045e-05, |
|
"loss": 2.0579, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.6553646603709182, |
|
"grad_norm": 0.4076862335205078, |
|
"learning_rate": 5.404918806044679e-05, |
|
"loss": 2.1348, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.6562748890658778, |
|
"grad_norm": 0.43610820174217224, |
|
"learning_rate": 5.379315560596038e-05, |
|
"loss": 2.2212, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.6571851177608374, |
|
"grad_norm": 0.41793620586395264, |
|
"learning_rate": 5.3537507698976365e-05, |
|
"loss": 1.9606, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.658095346455797, |
|
"grad_norm": 0.40680915117263794, |
|
"learning_rate": 5.328224646707479e-05, |
|
"loss": 2.0167, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.6590055751507566, |
|
"grad_norm": 0.424699991941452, |
|
"learning_rate": 5.3027374034617785e-05, |
|
"loss": 2.1065, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.6599158038457162, |
|
"grad_norm": 0.43258509039878845, |
|
"learning_rate": 5.277289252273174e-05, |
|
"loss": 2.0974, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.6608260325406758, |
|
"grad_norm": 0.4318636655807495, |
|
"learning_rate": 5.251880404928971e-05, |
|
"loss": 2.3214, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.6617362612356354, |
|
"grad_norm": 0.4147786498069763, |
|
"learning_rate": 5.226511072889371e-05, |
|
"loss": 2.1223, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.662646489930595, |
|
"grad_norm": 0.4131387770175934, |
|
"learning_rate": 5.201181467285723e-05, |
|
"loss": 1.8335, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.6635567186255547, |
|
"grad_norm": 0.456827849149704, |
|
"learning_rate": 5.175891798918757e-05, |
|
"loss": 2.1428, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.6644669473205143, |
|
"grad_norm": 0.47478604316711426, |
|
"learning_rate": 5.1506422782568345e-05, |
|
"loss": 2.0526, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.6653771760154739, |
|
"grad_norm": 0.4357281029224396, |
|
"learning_rate": 5.125433115434197e-05, |
|
"loss": 1.8949, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.6662874047104335, |
|
"grad_norm": 0.45749080181121826, |
|
"learning_rate": 5.100264520249205e-05, |
|
"loss": 2.1637, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.6671976334053931, |
|
"grad_norm": 0.47157686948776245, |
|
"learning_rate": 5.0751367021626215e-05, |
|
"loss": 2.1036, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.6681078621003527, |
|
"grad_norm": 0.4490397274494171, |
|
"learning_rate": 5.050049870295841e-05, |
|
"loss": 1.9553, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.6690180907953123, |
|
"grad_norm": 0.4798765480518341, |
|
"learning_rate": 5.025004233429145e-05, |
|
"loss": 2.0954, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.669928319490272, |
|
"grad_norm": 0.5034172534942627, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 2.2027, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.6708385481852316, |
|
"grad_norm": 0.481141060590744, |
|
"learning_rate": 4.9750373781012885e-05, |
|
"loss": 2.0822, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.6717487768801912, |
|
"grad_norm": 0.49731481075286865, |
|
"learning_rate": 4.950116575479586e-05, |
|
"loss": 2.0196, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.6726590055751508, |
|
"grad_norm": 0.48321208357810974, |
|
"learning_rate": 4.9252377995334444e-05, |
|
"loss": 1.9995, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.6735692342701104, |
|
"grad_norm": 0.5173296332359314, |
|
"learning_rate": 4.90040125731165e-05, |
|
"loss": 2.2078, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.67447946296507, |
|
"grad_norm": 0.5107961297035217, |
|
"learning_rate": 4.87560715551151e-05, |
|
"loss": 2.0684, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.6753896916600296, |
|
"grad_norm": 0.522492527961731, |
|
"learning_rate": 4.85085570047713e-05, |
|
"loss": 2.2168, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.6762999203549892, |
|
"grad_norm": 0.5417767763137817, |
|
"learning_rate": 4.826147098197691e-05, |
|
"loss": 2.2379, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.6772101490499488, |
|
"grad_norm": 0.5735164284706116, |
|
"learning_rate": 4.8014815543057475e-05, |
|
"loss": 2.2132, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.6781203777449084, |
|
"grad_norm": 0.5819071531295776, |
|
"learning_rate": 4.776859274075506e-05, |
|
"loss": 2.2469, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.679030606439868, |
|
"grad_norm": 0.6113678216934204, |
|
"learning_rate": 4.752280462421117e-05, |
|
"loss": 2.3064, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.6799408351348276, |
|
"grad_norm": 0.6506679654121399, |
|
"learning_rate": 4.727745323894976e-05, |
|
"loss": 2.3311, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.7372251152992249, |
|
"learning_rate": 4.703254062686017e-05, |
|
"loss": 2.575, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.6817612925247468, |
|
"grad_norm": 0.8235337734222412, |
|
"learning_rate": 4.678806882618003e-05, |
|
"loss": 2.4711, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.6826715212197064, |
|
"grad_norm": 1.295682430267334, |
|
"learning_rate": 4.654403987147865e-05, |
|
"loss": 2.5713, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.683581749914666, |
|
"grad_norm": 0.46585023403167725, |
|
"learning_rate": 4.630045579363957e-05, |
|
"loss": 2.4203, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.6844919786096256, |
|
"grad_norm": 0.4163340628147125, |
|
"learning_rate": 4.605731861984401e-05, |
|
"loss": 2.261, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.6854022073045852, |
|
"grad_norm": 0.4268229007720947, |
|
"learning_rate": 4.5814630373554115e-05, |
|
"loss": 2.3078, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.6863124359995448, |
|
"grad_norm": 0.43664565682411194, |
|
"learning_rate": 4.557239307449561e-05, |
|
"loss": 2.5044, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.6872226646945045, |
|
"grad_norm": 0.44144201278686523, |
|
"learning_rate": 4.5330608738641486e-05, |
|
"loss": 2.4192, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.6881328933894642, |
|
"grad_norm": 0.40867024660110474, |
|
"learning_rate": 4.508927937819499e-05, |
|
"loss": 2.1908, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.6890431220844238, |
|
"grad_norm": 0.4147084057331085, |
|
"learning_rate": 4.484840700157295e-05, |
|
"loss": 2.2864, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.6899533507793834, |
|
"grad_norm": 0.413703054189682, |
|
"learning_rate": 4.4607993613388976e-05, |
|
"loss": 2.2436, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.690863579474343, |
|
"grad_norm": 0.42309460043907166, |
|
"learning_rate": 4.436804121443689e-05, |
|
"loss": 2.3444, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.6917738081693026, |
|
"grad_norm": 0.4229698181152344, |
|
"learning_rate": 4.412855180167406e-05, |
|
"loss": 2.3264, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.6926840368642622, |
|
"grad_norm": 0.4247763156890869, |
|
"learning_rate": 4.388952736820453e-05, |
|
"loss": 2.257, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.6935942655592218, |
|
"grad_norm": 0.42383337020874023, |
|
"learning_rate": 4.365096990326297e-05, |
|
"loss": 2.1349, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.6945044942541814, |
|
"grad_norm": 0.4181368350982666, |
|
"learning_rate": 4.3412881392197526e-05, |
|
"loss": 2.2587, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.695414722949141, |
|
"grad_norm": 0.4086921811103821, |
|
"learning_rate": 4.317526381645363e-05, |
|
"loss": 2.2378, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.6963249516441006, |
|
"grad_norm": 0.4001654088497162, |
|
"learning_rate": 4.293811915355761e-05, |
|
"loss": 2.1708, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6972351803390602, |
|
"grad_norm": 0.42960214614868164, |
|
"learning_rate": 4.270144937709981e-05, |
|
"loss": 2.1556, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.6981454090340198, |
|
"grad_norm": 0.42000848054885864, |
|
"learning_rate": 4.2465256456718615e-05, |
|
"loss": 2.1182, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.6990556377289794, |
|
"grad_norm": 0.394368052482605, |
|
"learning_rate": 4.222954235808378e-05, |
|
"loss": 2.0486, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.699965866423939, |
|
"grad_norm": 0.3934192359447479, |
|
"learning_rate": 4.19943090428802e-05, |
|
"loss": 1.9222, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.7008760951188986, |
|
"grad_norm": 0.44399651885032654, |
|
"learning_rate": 4.175955846879151e-05, |
|
"loss": 2.1621, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7017863238138582, |
|
"grad_norm": 0.4081316888332367, |
|
"learning_rate": 4.1525292589483843e-05, |
|
"loss": 1.9534, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.7026965525088178, |
|
"grad_norm": 0.42008188366889954, |
|
"learning_rate": 4.129151335458957e-05, |
|
"loss": 1.9773, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.7036067812037774, |
|
"grad_norm": 0.4209793508052826, |
|
"learning_rate": 4.105822270969102e-05, |
|
"loss": 2.0403, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.7045170098987371, |
|
"grad_norm": 0.43969592452049255, |
|
"learning_rate": 4.0825422596304396e-05, |
|
"loss": 2.1796, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.7054272385936967, |
|
"grad_norm": 0.4333605468273163, |
|
"learning_rate": 4.059311495186338e-05, |
|
"loss": 2.1115, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.7063374672886563, |
|
"grad_norm": 0.42669251561164856, |
|
"learning_rate": 4.036130170970341e-05, |
|
"loss": 2.1563, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.7072476959836159, |
|
"grad_norm": 0.45064857602119446, |
|
"learning_rate": 4.012998479904525e-05, |
|
"loss": 2.2003, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.7081579246785755, |
|
"grad_norm": 0.4487575888633728, |
|
"learning_rate": 3.9899166144978904e-05, |
|
"loss": 2.1332, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.7090681533735351, |
|
"grad_norm": 0.4423072636127472, |
|
"learning_rate": 3.966884766844803e-05, |
|
"loss": 2.1449, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.7099783820684947, |
|
"grad_norm": 0.4436761736869812, |
|
"learning_rate": 3.943903128623335e-05, |
|
"loss": 2.0025, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7108886107634543, |
|
"grad_norm": 0.4364500045776367, |
|
"learning_rate": 3.920971891093718e-05, |
|
"loss": 1.9834, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.7117988394584139, |
|
"grad_norm": 0.44009071588516235, |
|
"learning_rate": 3.8980912450967366e-05, |
|
"loss": 2.0204, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.7127090681533735, |
|
"grad_norm": 0.4498206079006195, |
|
"learning_rate": 3.875261381052121e-05, |
|
"loss": 2.0348, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.7136192968483331, |
|
"grad_norm": 0.48029908537864685, |
|
"learning_rate": 3.852482488956992e-05, |
|
"loss": 2.0383, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.7145295255432927, |
|
"grad_norm": 0.4986077845096588, |
|
"learning_rate": 3.829754758384262e-05, |
|
"loss": 2.3006, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.7154397542382523, |
|
"grad_norm": 0.5001522302627563, |
|
"learning_rate": 3.807078378481059e-05, |
|
"loss": 2.3416, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.716349982933212, |
|
"grad_norm": 0.5049505829811096, |
|
"learning_rate": 3.784453537967161e-05, |
|
"loss": 2.1652, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.7172602116281716, |
|
"grad_norm": 0.5048404932022095, |
|
"learning_rate": 3.761880425133413e-05, |
|
"loss": 2.1345, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.7181704403231312, |
|
"grad_norm": 0.4869529604911804, |
|
"learning_rate": 3.7393592278401704e-05, |
|
"loss": 2.0906, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.7190806690180908, |
|
"grad_norm": 0.5454188585281372, |
|
"learning_rate": 3.7168901335157315e-05, |
|
"loss": 2.4214, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7199908977130504, |
|
"grad_norm": 0.5238876938819885, |
|
"learning_rate": 3.694473329154778e-05, |
|
"loss": 1.9798, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.72090112640801, |
|
"grad_norm": 0.5545910596847534, |
|
"learning_rate": 3.672109001316809e-05, |
|
"loss": 2.4726, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.7218113551029697, |
|
"grad_norm": 0.542072594165802, |
|
"learning_rate": 3.649797336124615e-05, |
|
"loss": 2.016, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.7227215837979293, |
|
"grad_norm": 0.5355279445648193, |
|
"learning_rate": 3.6275385192627056e-05, |
|
"loss": 2.1041, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.7236318124928889, |
|
"grad_norm": 0.5673330426216125, |
|
"learning_rate": 3.6053327359757535e-05, |
|
"loss": 2.1006, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.7245420411878485, |
|
"grad_norm": 0.6170483231544495, |
|
"learning_rate": 3.583180171067101e-05, |
|
"loss": 2.3275, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.7254522698828081, |
|
"grad_norm": 0.6877503991127014, |
|
"learning_rate": 3.5610810088971625e-05, |
|
"loss": 2.4504, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.7263624985777677, |
|
"grad_norm": 0.7676892280578613, |
|
"learning_rate": 3.5390354333819344e-05, |
|
"loss": 2.6627, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.9272985458374023, |
|
"learning_rate": 3.517043627991441e-05, |
|
"loss": 2.5253, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.7281829559676869, |
|
"grad_norm": 1.9746160507202148, |
|
"learning_rate": 3.4951057757482205e-05, |
|
"loss": 2.5993, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7290931846626465, |
|
"grad_norm": 0.4283389449119568, |
|
"learning_rate": 3.4732220592257946e-05, |
|
"loss": 2.5104, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.7300034133576061, |
|
"grad_norm": 0.43243661522865295, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 2.1707, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.7309136420525657, |
|
"grad_norm": 0.4262010455131531, |
|
"learning_rate": 3.429617761383222e-05, |
|
"loss": 2.2513, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.7318238707475253, |
|
"grad_norm": 0.4308786392211914, |
|
"learning_rate": 3.40789754295139e-05, |
|
"loss": 2.4042, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.7327340994424849, |
|
"grad_norm": 0.42787450551986694, |
|
"learning_rate": 3.3862321860139576e-05, |
|
"loss": 2.4259, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.7336443281374445, |
|
"grad_norm": 0.4381856620311737, |
|
"learning_rate": 3.364621870876659e-05, |
|
"loss": 2.4046, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.7345545568324041, |
|
"grad_norm": 0.4300837814807892, |
|
"learning_rate": 3.343066777387148e-05, |
|
"loss": 2.3713, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.7354647855273637, |
|
"grad_norm": 0.4228179156780243, |
|
"learning_rate": 3.3215670849335155e-05, |
|
"loss": 2.2606, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.7363750142223233, |
|
"grad_norm": 0.4267946779727936, |
|
"learning_rate": 3.300122972442773e-05, |
|
"loss": 2.3377, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.7372852429172829, |
|
"grad_norm": 0.41019585728645325, |
|
"learning_rate": 3.278734618379402e-05, |
|
"loss": 2.1903, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7381954716122425, |
|
"grad_norm": 0.42949211597442627, |
|
"learning_rate": 3.257402200743821e-05, |
|
"loss": 2.3309, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.7391057003072021, |
|
"grad_norm": 0.4410031735897064, |
|
"learning_rate": 3.2361258970709397e-05, |
|
"loss": 2.3924, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.7400159290021618, |
|
"grad_norm": 0.4223659336566925, |
|
"learning_rate": 3.21490588442868e-05, |
|
"loss": 2.2614, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.7409261576971214, |
|
"grad_norm": 0.43348926305770874, |
|
"learning_rate": 3.19374233941647e-05, |
|
"loss": 2.4255, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.741836386392081, |
|
"grad_norm": 0.42184650897979736, |
|
"learning_rate": 3.172635438163816e-05, |
|
"loss": 2.2794, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.7427466150870407, |
|
"grad_norm": 0.4127393066883087, |
|
"learning_rate": 3.1515853563288076e-05, |
|
"loss": 2.1242, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.7436568437820003, |
|
"grad_norm": 0.40478646755218506, |
|
"learning_rate": 3.130592269096671e-05, |
|
"loss": 2.035, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.7445670724769599, |
|
"grad_norm": 0.41883495450019836, |
|
"learning_rate": 3.1096563511783014e-05, |
|
"loss": 2.1427, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.7454773011719195, |
|
"grad_norm": 0.39868757128715515, |
|
"learning_rate": 3.08877777680882e-05, |
|
"loss": 2.0169, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.7463875298668791, |
|
"grad_norm": 0.392415851354599, |
|
"learning_rate": 3.0679567197461134e-05, |
|
"loss": 2.0969, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7472977585618387, |
|
"grad_norm": 0.4181436598300934, |
|
"learning_rate": 3.047193353269382e-05, |
|
"loss": 2.1766, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.7482079872567983, |
|
"grad_norm": 0.41824692487716675, |
|
"learning_rate": 3.0264878501777306e-05, |
|
"loss": 2.0897, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.7491182159517579, |
|
"grad_norm": 0.442640095949173, |
|
"learning_rate": 3.005840382788685e-05, |
|
"loss": 2.0851, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.7500284446467175, |
|
"grad_norm": 0.43662169575691223, |
|
"learning_rate": 2.9852511229367865e-05, |
|
"loss": 2.1546, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.7509386733416771, |
|
"grad_norm": 0.44712746143341064, |
|
"learning_rate": 2.9647202419721687e-05, |
|
"loss": 2.2304, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7509386733416771, |
|
"eval_loss": 2.2184386253356934, |
|
"eval_runtime": 205.4094, |
|
"eval_samples_per_second": 9.011, |
|
"eval_steps_per_second": 4.508, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.7518489020366367, |
|
"grad_norm": 0.4428364336490631, |
|
"learning_rate": 2.944247910759097e-05, |
|
"loss": 2.1631, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.7527591307315963, |
|
"grad_norm": 0.43727412819862366, |
|
"learning_rate": 2.9238342996745817e-05, |
|
"loss": 2.1495, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.7536693594265559, |
|
"grad_norm": 0.45698437094688416, |
|
"learning_rate": 2.9034795786069436e-05, |
|
"loss": 2.1497, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.7545795881215155, |
|
"grad_norm": 0.423408567905426, |
|
"learning_rate": 2.8831839169543996e-05, |
|
"loss": 1.9607, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.7554898168164751, |
|
"grad_norm": 0.4666843116283417, |
|
"learning_rate": 2.862947483623659e-05, |
|
"loss": 2.1271, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7564000455114347, |
|
"grad_norm": 0.46026623249053955, |
|
"learning_rate": 2.8427704470285144e-05, |
|
"loss": 2.1943, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.7573102742063944, |
|
"grad_norm": 0.459824800491333, |
|
"learning_rate": 2.8226529750884402e-05, |
|
"loss": 2.0793, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.758220502901354, |
|
"grad_norm": 0.4680033326148987, |
|
"learning_rate": 2.8025952352271958e-05, |
|
"loss": 2.1652, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.7591307315963136, |
|
"grad_norm": 0.47054019570350647, |
|
"learning_rate": 2.7825973943714335e-05, |
|
"loss": 2.1526, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.7600409602912732, |
|
"grad_norm": 0.465668648481369, |
|
"learning_rate": 2.7626596189492983e-05, |
|
"loss": 2.0476, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.7609511889862328, |
|
"grad_norm": 0.47716715931892395, |
|
"learning_rate": 2.7427820748890685e-05, |
|
"loss": 2.1511, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.7618614176811924, |
|
"grad_norm": 0.468532532453537, |
|
"learning_rate": 2.7229649276177503e-05, |
|
"loss": 2.1065, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.762771646376152, |
|
"grad_norm": 0.4883919358253479, |
|
"learning_rate": 2.7032083420597e-05, |
|
"loss": 2.2111, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.7636818750711116, |
|
"grad_norm": 0.5037781000137329, |
|
"learning_rate": 2.683512482635281e-05, |
|
"loss": 2.1824, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.7645921037660712, |
|
"grad_norm": 0.5097115635871887, |
|
"learning_rate": 2.6638775132594553e-05, |
|
"loss": 2.2818, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.7655023324610308, |
|
"grad_norm": 0.5003491640090942, |
|
"learning_rate": 2.6443035973404496e-05, |
|
"loss": 2.112, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.7664125611559904, |
|
"grad_norm": 0.5198303461074829, |
|
"learning_rate": 2.624790897778391e-05, |
|
"loss": 2.1864, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.76732278985095, |
|
"grad_norm": 0.5522372722625732, |
|
"learning_rate": 2.605339576963929e-05, |
|
"loss": 2.3857, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.7682330185459096, |
|
"grad_norm": 0.5393935441970825, |
|
"learning_rate": 2.585949796776912e-05, |
|
"loss": 2.2549, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.7691432472408692, |
|
"grad_norm": 0.573794424533844, |
|
"learning_rate": 2.5666217185850262e-05, |
|
"loss": 2.3236, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.7700534759358288, |
|
"grad_norm": 0.5689824819564819, |
|
"learning_rate": 2.5473555032424533e-05, |
|
"loss": 2.1463, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.7709637046307884, |
|
"grad_norm": 0.6545232534408569, |
|
"learning_rate": 2.528151311088537e-05, |
|
"loss": 2.3964, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.771873933325748, |
|
"grad_norm": 0.7613667845726013, |
|
"learning_rate": 2.50900930194644e-05, |
|
"loss": 2.7365, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.7727841620207077, |
|
"grad_norm": 0.8592699766159058, |
|
"learning_rate": 2.4899296351218227e-05, |
|
"loss": 2.3281, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.7736943907156673, |
|
"grad_norm": 1.6767549514770508, |
|
"learning_rate": 2.4709124694015116e-05, |
|
"loss": 2.4329, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.774604619410627, |
|
"grad_norm": 0.42721498012542725, |
|
"learning_rate": 2.451957963052185e-05, |
|
"loss": 2.4287, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.7755148481055866, |
|
"grad_norm": 0.4105132818222046, |
|
"learning_rate": 2.433066273819037e-05, |
|
"loss": 2.3069, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.7764250768005462, |
|
"grad_norm": 0.42730897665023804, |
|
"learning_rate": 2.4142375589244957e-05, |
|
"loss": 2.3786, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.7773353054955058, |
|
"grad_norm": 0.41130363941192627, |
|
"learning_rate": 2.3954719750668907e-05, |
|
"loss": 2.1378, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.7782455341904654, |
|
"grad_norm": 0.41424882411956787, |
|
"learning_rate": 2.3767696784191463e-05, |
|
"loss": 2.2526, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.779155762885425, |
|
"grad_norm": 0.42201122641563416, |
|
"learning_rate": 2.3581308246275103e-05, |
|
"loss": 2.3379, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.7800659915803846, |
|
"grad_norm": 0.4150107204914093, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 2.1722, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.7809762202753442, |
|
"grad_norm": 0.432271271944046, |
|
"learning_rate": 2.321044065556246e-05, |
|
"loss": 2.4875, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.7818864489703038, |
|
"grad_norm": 0.4255993068218231, |
|
"learning_rate": 2.302596468923981e-05, |
|
"loss": 2.3043, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.7827966776652634, |
|
"grad_norm": 0.4193149507045746, |
|
"learning_rate": 2.284212932439972e-05, |
|
"loss": 2.3238, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.783706906360223, |
|
"grad_norm": 0.3979727625846863, |
|
"learning_rate": 2.265893609097637e-05, |
|
"loss": 2.0908, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.7846171350551826, |
|
"grad_norm": 0.42481502890586853, |
|
"learning_rate": 2.247638651355991e-05, |
|
"loss": 2.3404, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.7855273637501422, |
|
"grad_norm": 0.41932496428489685, |
|
"learning_rate": 2.229448211138382e-05, |
|
"loss": 2.3529, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.7864375924451018, |
|
"grad_norm": 0.4188045263290405, |
|
"learning_rate": 2.211322439831218e-05, |
|
"loss": 2.1973, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.7873478211400614, |
|
"grad_norm": 0.41575223207473755, |
|
"learning_rate": 2.1932614882827197e-05, |
|
"loss": 2.2664, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.788258049835021, |
|
"grad_norm": 0.41103559732437134, |
|
"learning_rate": 2.1752655068016515e-05, |
|
"loss": 2.1176, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.7891682785299806, |
|
"grad_norm": 0.3994426727294922, |
|
"learning_rate": 2.1573346451560794e-05, |
|
"loss": 2.0824, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.7900785072249402, |
|
"grad_norm": 0.40480148792266846, |
|
"learning_rate": 2.139469052572127e-05, |
|
"loss": 1.9797, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.7909887359198998, |
|
"grad_norm": 0.4224672317504883, |
|
"learning_rate": 2.1216688777327154e-05, |
|
"loss": 2.0783, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.7918989646148595, |
|
"grad_norm": 0.4260886013507843, |
|
"learning_rate": 2.1039342687763586e-05, |
|
"loss": 2.203, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7928091933098191, |
|
"grad_norm": 0.41183462738990784, |
|
"learning_rate": 2.0862653732958915e-05, |
|
"loss": 1.9724, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.7937194220047787, |
|
"grad_norm": 0.43447592854499817, |
|
"learning_rate": 2.0686623383372715e-05, |
|
"loss": 2.1632, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.7946296506997383, |
|
"grad_norm": 0.4297522008419037, |
|
"learning_rate": 2.051125310398353e-05, |
|
"loss": 2.0486, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.795539879394698, |
|
"grad_norm": 0.45072224736213684, |
|
"learning_rate": 2.03365443542764e-05, |
|
"loss": 2.1973, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.7964501080896575, |
|
"grad_norm": 0.4462050199508667, |
|
"learning_rate": 2.016249858823106e-05, |
|
"loss": 2.0274, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.7973603367846172, |
|
"grad_norm": 0.4606810212135315, |
|
"learning_rate": 1.998911725430963e-05, |
|
"loss": 2.1616, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.7982705654795768, |
|
"grad_norm": 0.44487303495407104, |
|
"learning_rate": 1.981640179544466e-05, |
|
"loss": 2.323, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.7991807941745364, |
|
"grad_norm": 0.45202627778053284, |
|
"learning_rate": 1.964435364902705e-05, |
|
"loss": 2.1361, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.800091022869496, |
|
"grad_norm": 0.44588690996170044, |
|
"learning_rate": 1.947297424689414e-05, |
|
"loss": 2.1173, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.8010012515644556, |
|
"grad_norm": 0.46819573640823364, |
|
"learning_rate": 1.93022650153178e-05, |
|
"loss": 2.0187, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8019114802594152, |
|
"grad_norm": 0.44944408535957336, |
|
"learning_rate": 1.913222737499243e-05, |
|
"loss": 2.0103, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.8028217089543748, |
|
"grad_norm": 0.44194296002388, |
|
"learning_rate": 1.8962862741023423e-05, |
|
"loss": 1.9489, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.8037319376493344, |
|
"grad_norm": 0.4707835614681244, |
|
"learning_rate": 1.879417252291502e-05, |
|
"loss": 2.1982, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.804642166344294, |
|
"grad_norm": 0.4707585573196411, |
|
"learning_rate": 1.8626158124558858e-05, |
|
"loss": 2.1049, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.8055523950392536, |
|
"grad_norm": 0.4964425265789032, |
|
"learning_rate": 1.8458820944222255e-05, |
|
"loss": 2.2127, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.8064626237342132, |
|
"grad_norm": 0.4742617607116699, |
|
"learning_rate": 1.829216237453637e-05, |
|
"loss": 2.1019, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.8073728524291728, |
|
"grad_norm": 0.49655184149742126, |
|
"learning_rate": 1.8126183802484865e-05, |
|
"loss": 2.2403, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.8082830811241324, |
|
"grad_norm": 0.4954749643802643, |
|
"learning_rate": 1.7960886609392214e-05, |
|
"loss": 2.0321, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.8091933098190921, |
|
"grad_norm": 0.4694468379020691, |
|
"learning_rate": 1.7796272170912253e-05, |
|
"loss": 1.817, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.8101035385140517, |
|
"grad_norm": 0.5026715397834778, |
|
"learning_rate": 1.763234185701673e-05, |
|
"loss": 2.2038, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8110137672090113, |
|
"grad_norm": 0.5050073862075806, |
|
"learning_rate": 1.7469097031983893e-05, |
|
"loss": 2.0861, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.8119239959039709, |
|
"grad_norm": 0.5078185796737671, |
|
"learning_rate": 1.730653905438714e-05, |
|
"loss": 2.0672, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.8128342245989305, |
|
"grad_norm": 0.525215744972229, |
|
"learning_rate": 1.7144669277083712e-05, |
|
"loss": 2.1502, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.8137444532938901, |
|
"grad_norm": 0.5429519414901733, |
|
"learning_rate": 1.6983489047203483e-05, |
|
"loss": 2.0935, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.8146546819888497, |
|
"grad_norm": 0.5544317960739136, |
|
"learning_rate": 1.6822999706137567e-05, |
|
"loss": 2.0943, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.8155649106838093, |
|
"grad_norm": 0.6273201107978821, |
|
"learning_rate": 1.6663202589527473e-05, |
|
"loss": 2.3608, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.8164751393787689, |
|
"grad_norm": 0.7101454734802246, |
|
"learning_rate": 1.6504099027253706e-05, |
|
"loss": 2.4168, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.8173853680737285, |
|
"grad_norm": 0.7550842761993408, |
|
"learning_rate": 1.634569034342476e-05, |
|
"loss": 2.5798, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.8182955967686881, |
|
"grad_norm": 0.8533863425254822, |
|
"learning_rate": 1.6187977856366253e-05, |
|
"loss": 2.5575, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.8192058254636477, |
|
"grad_norm": 1.34774911403656, |
|
"learning_rate": 1.6030962878609725e-05, |
|
"loss": 2.4134, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.8201160541586073, |
|
"grad_norm": 0.4584032893180847, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 2.4781, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.8210262828535669, |
|
"grad_norm": 0.43342748284339905, |
|
"learning_rate": 1.5719030672093717e-05, |
|
"loss": 2.3685, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.8219365115485265, |
|
"grad_norm": 0.4225307106971741, |
|
"learning_rate": 1.5564116039329545e-05, |
|
"loss": 2.2022, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.8228467402434861, |
|
"grad_norm": 0.43026039004325867, |
|
"learning_rate": 1.5409904107836358e-05, |
|
"loss": 2.2817, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.8237569689384457, |
|
"grad_norm": 0.4114493131637573, |
|
"learning_rate": 1.5256396161013075e-05, |
|
"loss": 2.3298, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.8246671976334053, |
|
"grad_norm": 0.42313718795776367, |
|
"learning_rate": 1.5103593476399791e-05, |
|
"loss": 2.3211, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.825577426328365, |
|
"grad_norm": 0.4246841371059418, |
|
"learning_rate": 1.495149732566723e-05, |
|
"loss": 2.2385, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.8264876550233247, |
|
"grad_norm": 0.4131985008716583, |
|
"learning_rate": 1.4800108974606119e-05, |
|
"loss": 2.2873, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.8273978837182843, |
|
"grad_norm": 0.42265599966049194, |
|
"learning_rate": 1.4649429683116644e-05, |
|
"loss": 2.1486, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.8283081124132439, |
|
"grad_norm": 0.4338424801826477, |
|
"learning_rate": 1.4499460705197998e-05, |
|
"loss": 2.2365, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8292183411082035, |
|
"grad_norm": 0.4278540015220642, |
|
"learning_rate": 1.4350203288937936e-05, |
|
"loss": 2.36, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.8301285698031631, |
|
"grad_norm": 0.41379448771476746, |
|
"learning_rate": 1.4201658676502294e-05, |
|
"loss": 2.184, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.8310387984981227, |
|
"grad_norm": 0.42351198196411133, |
|
"learning_rate": 1.4053828104124867e-05, |
|
"loss": 2.2505, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.8319490271930823, |
|
"grad_norm": 0.40783679485321045, |
|
"learning_rate": 1.3906712802096933e-05, |
|
"loss": 2.0255, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.8328592558880419, |
|
"grad_norm": 0.4174416661262512, |
|
"learning_rate": 1.3760313994757001e-05, |
|
"loss": 2.2376, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.8337694845830015, |
|
"grad_norm": 0.41884645819664, |
|
"learning_rate": 1.361463290048085e-05, |
|
"loss": 2.0206, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.8346797132779611, |
|
"grad_norm": 0.399498850107193, |
|
"learning_rate": 1.3469670731671046e-05, |
|
"loss": 2.063, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.8355899419729207, |
|
"grad_norm": 0.40431055426597595, |
|
"learning_rate": 1.3325428694747177e-05, |
|
"loss": 2.0053, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.8365001706678803, |
|
"grad_norm": 0.40479356050491333, |
|
"learning_rate": 1.3181907990135622e-05, |
|
"loss": 2.0693, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.8374103993628399, |
|
"grad_norm": 0.4056653678417206, |
|
"learning_rate": 1.3039109812259598e-05, |
|
"loss": 2.0361, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8383206280577995, |
|
"grad_norm": 0.4257088005542755, |
|
"learning_rate": 1.2897035349529263e-05, |
|
"loss": 2.0589, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.8392308567527591, |
|
"grad_norm": 0.43024080991744995, |
|
"learning_rate": 1.2755685784331783e-05, |
|
"loss": 2.0419, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.8401410854477187, |
|
"grad_norm": 0.42889195680618286, |
|
"learning_rate": 1.2615062293021507e-05, |
|
"loss": 2.0515, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.8410513141426783, |
|
"grad_norm": 0.4491952061653137, |
|
"learning_rate": 1.2475166045910159e-05, |
|
"loss": 2.2535, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.8419615428376379, |
|
"grad_norm": 0.43797358870506287, |
|
"learning_rate": 1.2335998207257137e-05, |
|
"loss": 2.1338, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.8428717715325975, |
|
"grad_norm": 0.4491622745990753, |
|
"learning_rate": 1.2197559935259795e-05, |
|
"loss": 2.2059, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.8437820002275572, |
|
"grad_norm": 0.43628188967704773, |
|
"learning_rate": 1.20598523820438e-05, |
|
"loss": 1.8784, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.8446922289225168, |
|
"grad_norm": 0.45739004015922546, |
|
"learning_rate": 1.1922876693653585e-05, |
|
"loss": 2.0433, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.8456024576174764, |
|
"grad_norm": 0.44873446226119995, |
|
"learning_rate": 1.1786634010042719e-05, |
|
"loss": 1.9578, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.846512686312436, |
|
"grad_norm": 0.43957433104515076, |
|
"learning_rate": 1.1651125465064516e-05, |
|
"loss": 2.0078, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8474229150073956, |
|
"grad_norm": 0.4639342129230499, |
|
"learning_rate": 1.1516352186462586e-05, |
|
"loss": 2.0714, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.8483331437023552, |
|
"grad_norm": 0.44638022780418396, |
|
"learning_rate": 1.13823152958614e-05, |
|
"loss": 1.8991, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.8492433723973148, |
|
"grad_norm": 0.4596819579601288, |
|
"learning_rate": 1.1249015908756998e-05, |
|
"loss": 1.9595, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.8501536010922744, |
|
"grad_norm": 0.47656434774398804, |
|
"learning_rate": 1.1116455134507664e-05, |
|
"loss": 2.0788, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.4645254611968994, |
|
"learning_rate": 1.098463407632474e-05, |
|
"loss": 2.0703, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.8519740584821937, |
|
"grad_norm": 0.4659541845321655, |
|
"learning_rate": 1.0853553831263418e-05, |
|
"loss": 2.0804, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.8528842871771533, |
|
"grad_norm": 0.4771886467933655, |
|
"learning_rate": 1.0723215490213634e-05, |
|
"loss": 2.1124, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.8537945158721129, |
|
"grad_norm": 0.49211612343788147, |
|
"learning_rate": 1.0593620137890948e-05, |
|
"loss": 2.2221, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.8547047445670725, |
|
"grad_norm": 0.5174618363380432, |
|
"learning_rate": 1.0464768852827545e-05, |
|
"loss": 2.1684, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.8556149732620321, |
|
"grad_norm": 0.5098733305931091, |
|
"learning_rate": 1.0336662707363287e-05, |
|
"loss": 2.103, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.8565252019569917, |
|
"grad_norm": 0.5197715163230896, |
|
"learning_rate": 1.0209302767636664e-05, |
|
"loss": 2.2107, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.8574354306519513, |
|
"grad_norm": 0.547512412071228, |
|
"learning_rate": 1.0082690093576163e-05, |
|
"loss": 2.2448, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.8583456593469109, |
|
"grad_norm": 0.5418568849563599, |
|
"learning_rate": 9.95682573889114e-06, |
|
"loss": 2.2423, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.8592558880418705, |
|
"grad_norm": 0.5369839072227478, |
|
"learning_rate": 9.831710751063283e-06, |
|
"loss": 1.9788, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.8601661167368301, |
|
"grad_norm": 0.573844313621521, |
|
"learning_rate": 9.707346171337894e-06, |
|
"loss": 2.2906, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.8610763454317898, |
|
"grad_norm": 0.6142247915267944, |
|
"learning_rate": 9.583733034714981e-06, |
|
"loss": 2.3744, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.8619865741267494, |
|
"grad_norm": 0.6646602153778076, |
|
"learning_rate": 9.460872369940955e-06, |
|
"loss": 2.4641, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.862896802821709, |
|
"grad_norm": 0.727783739566803, |
|
"learning_rate": 9.338765199499854e-06, |
|
"loss": 2.4612, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.8638070315166686, |
|
"grad_norm": 0.851578950881958, |
|
"learning_rate": 9.217412539604942e-06, |
|
"loss": 2.6441, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.8647172602116282, |
|
"grad_norm": 1.461125373840332, |
|
"learning_rate": 9.096815400190172e-06, |
|
"loss": 2.4248, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.8656274889065878, |
|
"grad_norm": 0.4408392608165741, |
|
"learning_rate": 8.97697478490188e-06, |
|
"loss": 2.5431, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.8665377176015474, |
|
"grad_norm": 0.4102359414100647, |
|
"learning_rate": 8.857891691090337e-06, |
|
"loss": 2.3448, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.867447946296507, |
|
"grad_norm": 0.4374777674674988, |
|
"learning_rate": 8.739567109801494e-06, |
|
"loss": 2.3647, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.8683581749914666, |
|
"grad_norm": 0.40009114146232605, |
|
"learning_rate": 8.62200202576875e-06, |
|
"loss": 2.2401, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.8692684036864262, |
|
"grad_norm": 0.42013484239578247, |
|
"learning_rate": 8.505197417404687e-06, |
|
"loss": 2.1772, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.8701786323813858, |
|
"grad_norm": 0.43588119745254517, |
|
"learning_rate": 8.38915425679304e-06, |
|
"loss": 2.4605, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.8710888610763454, |
|
"grad_norm": 0.4295041561126709, |
|
"learning_rate": 8.273873509680519e-06, |
|
"loss": 2.4302, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.871999089771305, |
|
"grad_norm": 0.4430733621120453, |
|
"learning_rate": 8.15935613546872e-06, |
|
"loss": 2.3014, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.8729093184662646, |
|
"grad_norm": 0.4275224804878235, |
|
"learning_rate": 8.045603087206388e-06, |
|
"loss": 2.251, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.8738195471612242, |
|
"grad_norm": 0.4218734800815582, |
|
"learning_rate": 7.932615311581126e-06, |
|
"loss": 2.2841, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.8747297758561838, |
|
"grad_norm": 0.4275785982608795, |
|
"learning_rate": 7.820393748911791e-06, |
|
"loss": 2.2751, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.8756400045511434, |
|
"grad_norm": 0.40714067220687866, |
|
"learning_rate": 7.708939333140642e-06, |
|
"loss": 2.2023, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.876550233246103, |
|
"grad_norm": 0.4284750521183014, |
|
"learning_rate": 7.598252991825372e-06, |
|
"loss": 2.1991, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.8774604619410626, |
|
"grad_norm": 0.40348193049430847, |
|
"learning_rate": 7.488335646131628e-06, |
|
"loss": 2.1214, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.8783706906360224, |
|
"grad_norm": 0.4067203998565674, |
|
"learning_rate": 7.3791882108251945e-06, |
|
"loss": 2.0977, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.879280919330982, |
|
"grad_norm": 0.40969371795654297, |
|
"learning_rate": 7.270811594264437e-06, |
|
"loss": 2.1751, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.8801911480259416, |
|
"grad_norm": 0.39071908593177795, |
|
"learning_rate": 7.163206698392744e-06, |
|
"loss": 2.0464, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.8811013767209012, |
|
"grad_norm": 0.4038424789905548, |
|
"learning_rate": 7.056374418730971e-06, |
|
"loss": 2.1137, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.8820116054158608, |
|
"grad_norm": 0.38801443576812744, |
|
"learning_rate": 6.950315644370075e-06, |
|
"loss": 1.883, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.8829218341108204, |
|
"grad_norm": 0.3895006477832794, |
|
"learning_rate": 6.845031257963619e-06, |
|
"loss": 2.0169, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.88383206280578, |
|
"grad_norm": 0.413171648979187, |
|
"learning_rate": 6.740522135720517e-06, |
|
"loss": 2.2054, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.8847422915007396, |
|
"grad_norm": 0.4204188585281372, |
|
"learning_rate": 6.636789147397637e-06, |
|
"loss": 2.1765, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.8856525201956992, |
|
"grad_norm": 0.4209098517894745, |
|
"learning_rate": 6.533833156292679e-06, |
|
"loss": 1.9617, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.8865627488906588, |
|
"grad_norm": 0.4256611168384552, |
|
"learning_rate": 6.431655019236948e-06, |
|
"loss": 2.108, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.8874729775856184, |
|
"grad_norm": 0.43669816851615906, |
|
"learning_rate": 6.3302555865880965e-06, |
|
"loss": 2.0991, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.888383206280578, |
|
"grad_norm": 0.44833648204803467, |
|
"learning_rate": 6.229635702223324e-06, |
|
"loss": 2.2335, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.8892934349755376, |
|
"grad_norm": 0.45070621371269226, |
|
"learning_rate": 6.129796203532057e-06, |
|
"loss": 2.2487, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.8902036636704972, |
|
"grad_norm": 0.4609052538871765, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 2.1147, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.8911138923654568, |
|
"grad_norm": 0.4470416307449341, |
|
"learning_rate": 5.932461680248014e-06, |
|
"loss": 2.0615, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.8920241210604164, |
|
"grad_norm": 0.42171233892440796, |
|
"learning_rate": 5.834968297933541e-06, |
|
"loss": 2.0669, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.892934349755376, |
|
"grad_norm": 0.4385877251625061, |
|
"learning_rate": 5.738258585835532e-06, |
|
"loss": 1.9846, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.8938445784503356, |
|
"grad_norm": 0.4574371576309204, |
|
"learning_rate": 5.6423333488018095e-06, |
|
"loss": 2.114, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.8947548071452952, |
|
"grad_norm": 0.46896499395370483, |
|
"learning_rate": 5.547193385151561e-06, |
|
"loss": 2.0444, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.8956650358402549, |
|
"grad_norm": 0.45737412571907043, |
|
"learning_rate": 5.45283948666866e-06, |
|
"loss": 2.0976, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.8965752645352145, |
|
"grad_norm": 0.47739726305007935, |
|
"learning_rate": 5.359272438595153e-06, |
|
"loss": 2.1393, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.8974854932301741, |
|
"grad_norm": 0.47124338150024414, |
|
"learning_rate": 5.266493019624663e-06, |
|
"loss": 2.0509, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.8983957219251337, |
|
"grad_norm": 0.4660322070121765, |
|
"learning_rate": 5.1745020018958866e-06, |
|
"loss": 1.9704, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.8993059506200933, |
|
"grad_norm": 0.48330241441726685, |
|
"learning_rate": 5.083300150986259e-06, |
|
"loss": 2.021, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.9002161793150529, |
|
"grad_norm": 0.51470547914505, |
|
"learning_rate": 4.992888225905468e-06, |
|
"loss": 2.1097, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.9011264080100125, |
|
"grad_norm": 0.516373336315155, |
|
"learning_rate": 4.903266979089249e-06, |
|
"loss": 2.1694, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9020366367049721, |
|
"grad_norm": 0.5256400108337402, |
|
"learning_rate": 4.8144371563930476e-06, |
|
"loss": 2.2843, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.9029468653999317, |
|
"grad_norm": 0.5559744238853455, |
|
"learning_rate": 4.726399497085832e-06, |
|
"loss": 2.2733, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.9038570940948913, |
|
"grad_norm": 0.5462202429771423, |
|
"learning_rate": 4.6391547338439536e-06, |
|
"loss": 2.1758, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.904767322789851, |
|
"grad_norm": 0.5769087672233582, |
|
"learning_rate": 4.552703592745033e-06, |
|
"loss": 2.2552, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.9056775514848106, |
|
"grad_norm": 0.5632253289222717, |
|
"learning_rate": 4.467046793261931e-06, |
|
"loss": 2.2402, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.9065877801797702, |
|
"grad_norm": 0.610163688659668, |
|
"learning_rate": 4.3821850482567595e-06, |
|
"loss": 2.4484, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.9074980088747298, |
|
"grad_norm": 0.6236492395401001, |
|
"learning_rate": 4.298119063974914e-06, |
|
"loss": 2.1914, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.9084082375696894, |
|
"grad_norm": 0.7362584471702576, |
|
"learning_rate": 4.214849540039267e-06, |
|
"loss": 2.5582, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.909318466264649, |
|
"grad_norm": 0.8780522346496582, |
|
"learning_rate": 4.132377169444279e-06, |
|
"loss": 2.5269, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.9102286949596086, |
|
"grad_norm": 1.4850975275039673, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 2.3857, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9111389236545682, |
|
"grad_norm": 0.4350475072860718, |
|
"learning_rate": 3.969826627077655e-06, |
|
"loss": 2.5653, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.9120491523495278, |
|
"grad_norm": 0.44277626276016235, |
|
"learning_rate": 3.889749808101395e-06, |
|
"loss": 2.3969, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.9129593810444874, |
|
"grad_norm": 0.44005268812179565, |
|
"learning_rate": 3.810472848045266e-06, |
|
"loss": 2.6065, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.9138696097394471, |
|
"grad_norm": 0.41925248503685, |
|
"learning_rate": 3.7319964066763858e-06, |
|
"loss": 2.3878, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.9147798384344067, |
|
"grad_norm": 0.4320535957813263, |
|
"learning_rate": 3.6543211370997587e-06, |
|
"loss": 2.3829, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.9156900671293663, |
|
"grad_norm": 0.43817150592803955, |
|
"learning_rate": 3.5774476857527107e-06, |
|
"loss": 2.3854, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.9166002958243259, |
|
"grad_norm": 0.42210131883621216, |
|
"learning_rate": 3.5013766923996604e-06, |
|
"loss": 2.2874, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.9175105245192855, |
|
"grad_norm": 0.41610825061798096, |
|
"learning_rate": 3.426108790126681e-06, |
|
"loss": 2.3301, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.9184207532142451, |
|
"grad_norm": 0.42343541979789734, |
|
"learning_rate": 3.3516446053363015e-06, |
|
"loss": 2.2083, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.9193309819092047, |
|
"grad_norm": 0.4323045015335083, |
|
"learning_rate": 3.2779847577422697e-06, |
|
"loss": 2.2401, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9202412106041643, |
|
"grad_norm": 0.4198078513145447, |
|
"learning_rate": 3.2051298603643753e-06, |
|
"loss": 2.0988, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.9211514392991239, |
|
"grad_norm": 0.4277539551258087, |
|
"learning_rate": 3.133080519523368e-06, |
|
"loss": 2.3482, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.9220616679940835, |
|
"grad_norm": 0.42749837040901184, |
|
"learning_rate": 3.0618373348359264e-06, |
|
"loss": 2.3242, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.9229718966890431, |
|
"grad_norm": 0.4157456159591675, |
|
"learning_rate": 2.991400899209651e-06, |
|
"loss": 2.11, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.9238821253840027, |
|
"grad_norm": 0.41514283418655396, |
|
"learning_rate": 2.921771798838069e-06, |
|
"loss": 2.0979, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.9247923540789623, |
|
"grad_norm": 0.4146190285682678, |
|
"learning_rate": 2.852950613195915e-06, |
|
"loss": 2.1057, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.9257025827739219, |
|
"grad_norm": 0.4031788110733032, |
|
"learning_rate": 2.784937915034169e-06, |
|
"loss": 2.1094, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.9266128114688815, |
|
"grad_norm": 0.4135347604751587, |
|
"learning_rate": 2.717734270375272e-06, |
|
"loss": 2.2154, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.9275230401638411, |
|
"grad_norm": 0.40153443813323975, |
|
"learning_rate": 2.6513402385085704e-06, |
|
"loss": 2.0342, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.9284332688588007, |
|
"grad_norm": 0.4011882543563843, |
|
"learning_rate": 2.585756371985493e-06, |
|
"loss": 1.9751, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9293434975537603, |
|
"grad_norm": 0.4032374322414398, |
|
"learning_rate": 2.520983216615047e-06, |
|
"loss": 2.0868, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.9302537262487199, |
|
"grad_norm": 0.4052782952785492, |
|
"learning_rate": 2.4570213114592954e-06, |
|
"loss": 2.0716, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.9311639549436797, |
|
"grad_norm": 0.4261015057563782, |
|
"learning_rate": 2.393871188828767e-06, |
|
"loss": 2.1153, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.9320741836386393, |
|
"grad_norm": 0.4141393303871155, |
|
"learning_rate": 2.3315333742780942e-06, |
|
"loss": 2.0839, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.9329844123335989, |
|
"grad_norm": 0.41185298562049866, |
|
"learning_rate": 2.270008386601685e-06, |
|
"loss": 2.0686, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.9338946410285585, |
|
"grad_norm": 0.41932380199432373, |
|
"learning_rate": 2.2092967378292915e-06, |
|
"loss": 2.0688, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.9348048697235181, |
|
"grad_norm": 0.430480033159256, |
|
"learning_rate": 2.1493989332218468e-06, |
|
"loss": 2.1202, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.9357150984184777, |
|
"grad_norm": 0.4182969629764557, |
|
"learning_rate": 2.0903154712672237e-06, |
|
"loss": 1.8457, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.9366253271134373, |
|
"grad_norm": 0.44270989298820496, |
|
"learning_rate": 2.032046843676061e-06, |
|
"loss": 2.2296, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.9375355558083969, |
|
"grad_norm": 0.4312398433685303, |
|
"learning_rate": 1.974593535377722e-06, |
|
"loss": 1.9802, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9384457845033565, |
|
"grad_norm": 0.4557861387729645, |
|
"learning_rate": 1.917956024516243e-06, |
|
"loss": 1.9306, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.9393560131983161, |
|
"grad_norm": 0.4993920624256134, |
|
"learning_rate": 1.8621347824462787e-06, |
|
"loss": 2.275, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.9402662418932757, |
|
"grad_norm": 0.4622466266155243, |
|
"learning_rate": 1.8071302737293295e-06, |
|
"loss": 2.1966, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.4710349142551422, |
|
"learning_rate": 1.752942956129744e-06, |
|
"loss": 2.1353, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.9420866992831949, |
|
"grad_norm": 0.4602985680103302, |
|
"learning_rate": 1.6995732806109554e-06, |
|
"loss": 2.0855, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.9429969279781545, |
|
"grad_norm": 0.4736422002315521, |
|
"learning_rate": 1.6470216913317626e-06, |
|
"loss": 2.081, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.9439071566731141, |
|
"grad_norm": 0.4875909984111786, |
|
"learning_rate": 1.5952886256425547e-06, |
|
"loss": 2.1607, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.9448173853680737, |
|
"grad_norm": 0.5093516111373901, |
|
"learning_rate": 1.5443745140817366e-06, |
|
"loss": 2.2025, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.9457276140630333, |
|
"grad_norm": 0.5034651160240173, |
|
"learning_rate": 1.4942797803721543e-06, |
|
"loss": 1.9985, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.9466378427579929, |
|
"grad_norm": 0.5074111819267273, |
|
"learning_rate": 1.4450048414174854e-06, |
|
"loss": 2.1175, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9475480714529525, |
|
"grad_norm": 0.5183742046356201, |
|
"learning_rate": 1.3965501072988663e-06, |
|
"loss": 2.0718, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.9484583001479122, |
|
"grad_norm": 0.5284718871116638, |
|
"learning_rate": 1.348915981271437e-06, |
|
"loss": 2.1586, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.9493685288428718, |
|
"grad_norm": 0.545464813709259, |
|
"learning_rate": 1.3021028597609675e-06, |
|
"loss": 2.2445, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.9502787575378314, |
|
"grad_norm": 0.5713001489639282, |
|
"learning_rate": 1.2561111323605712e-06, |
|
"loss": 2.1888, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.951188986232791, |
|
"grad_norm": 0.5774447321891785, |
|
"learning_rate": 1.2109411818274852e-06, |
|
"loss": 2.2029, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.9520992149277506, |
|
"grad_norm": 0.6209971308708191, |
|
"learning_rate": 1.1665933840798838e-06, |
|
"loss": 2.1735, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.9530094436227102, |
|
"grad_norm": 0.6675162315368652, |
|
"learning_rate": 1.1230681081936923e-06, |
|
"loss": 2.4231, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.9539196723176698, |
|
"grad_norm": 0.7409051060676575, |
|
"learning_rate": 1.0803657163995895e-06, |
|
"loss": 2.45, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.9548299010126294, |
|
"grad_norm": 0.8856377601623535, |
|
"learning_rate": 1.0384865640799435e-06, |
|
"loss": 2.4769, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.955740129707589, |
|
"grad_norm": 1.4897712469100952, |
|
"learning_rate": 9.974309997658915e-07, |
|
"loss": 2.6822, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9566503584025486, |
|
"grad_norm": 0.4162799119949341, |
|
"learning_rate": 9.57199365134387e-07, |
|
"loss": 2.493, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.9575605870975082, |
|
"grad_norm": 0.4336461126804352, |
|
"learning_rate": 9.177919950054237e-07, |
|
"loss": 2.4071, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.9584708157924678, |
|
"grad_norm": 0.42869681119918823, |
|
"learning_rate": 8.792092173391831e-07, |
|
"loss": 2.3585, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.9593810444874274, |
|
"grad_norm": 0.4060511887073517, |
|
"learning_rate": 8.41451353233369e-07, |
|
"loss": 2.3244, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.960291273182387, |
|
"grad_norm": 0.42351028323173523, |
|
"learning_rate": 8.04518716920466e-07, |
|
"loss": 2.4194, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.9612015018773467, |
|
"grad_norm": 0.42555585503578186, |
|
"learning_rate": 7.684116157651966e-07, |
|
"loss": 2.368, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.9621117305723063, |
|
"grad_norm": 0.4169003665447235, |
|
"learning_rate": 7.331303502618903e-07, |
|
"loss": 2.3947, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.9630219592672659, |
|
"grad_norm": 0.43154704570770264, |
|
"learning_rate": 6.986752140320518e-07, |
|
"loss": 2.2809, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.9639321879622255, |
|
"grad_norm": 0.40992870926856995, |
|
"learning_rate": 6.650464938218637e-07, |
|
"loss": 2.2827, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.9648424166571851, |
|
"grad_norm": 0.4168105125427246, |
|
"learning_rate": 6.322444694998319e-07, |
|
"loss": 2.3334, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.9657526453521448, |
|
"grad_norm": 0.43256238102912903, |
|
"learning_rate": 6.002694140544329e-07, |
|
"loss": 2.2669, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.9666628740471044, |
|
"grad_norm": 0.42290446162223816, |
|
"learning_rate": 5.691215935918815e-07, |
|
"loss": 2.1027, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.967573102742064, |
|
"grad_norm": 0.4109112024307251, |
|
"learning_rate": 5.388012673338661e-07, |
|
"loss": 2.15, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.9684833314370236, |
|
"grad_norm": 0.4292824864387512, |
|
"learning_rate": 5.093086876154174e-07, |
|
"loss": 2.2394, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.9693935601319832, |
|
"grad_norm": 0.4136911928653717, |
|
"learning_rate": 4.80644099882821e-07, |
|
"loss": 2.1904, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.9703037888269428, |
|
"grad_norm": 0.4183078110218048, |
|
"learning_rate": 4.5280774269154115e-07, |
|
"loss": 2.1499, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.9712140175219024, |
|
"grad_norm": 0.41571420431137085, |
|
"learning_rate": 4.2579984770426686e-07, |
|
"loss": 2.0975, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 0.972124246216862, |
|
"grad_norm": 0.39653652906417847, |
|
"learning_rate": 3.99620639688969e-07, |
|
"loss": 1.9686, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.9730344749118216, |
|
"grad_norm": 0.4133754372596741, |
|
"learning_rate": 3.742703365170241e-07, |
|
"loss": 2.0801, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 0.9739447036067812, |
|
"grad_norm": 0.41434839367866516, |
|
"learning_rate": 3.497491491614158e-07, |
|
"loss": 1.9311, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.9748549323017408, |
|
"grad_norm": 0.41751164197921753, |
|
"learning_rate": 3.260572816949692e-07, |
|
"loss": 2.1841, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.9757651609967004, |
|
"grad_norm": 0.41821038722991943, |
|
"learning_rate": 3.0319493128866396e-07, |
|
"loss": 2.0719, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.97667538969166, |
|
"grad_norm": 0.4422387480735779, |
|
"learning_rate": 2.8116228820997957e-07, |
|
"loss": 2.2655, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 0.9775856183866196, |
|
"grad_norm": 0.4252420961856842, |
|
"learning_rate": 2.5995953582130804e-07, |
|
"loss": 2.098, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.9784958470815792, |
|
"grad_norm": 0.4352913498878479, |
|
"learning_rate": 2.395868505784438e-07, |
|
"loss": 2.1038, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.9794060757765388, |
|
"grad_norm": 0.43940603733062744, |
|
"learning_rate": 2.2004440202911814e-07, |
|
"loss": 2.1152, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.9803163044714984, |
|
"grad_norm": 0.43396398425102234, |
|
"learning_rate": 2.0133235281156736e-07, |
|
"loss": 2.0175, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.981226533166458, |
|
"grad_norm": 0.43348294496536255, |
|
"learning_rate": 1.83450858653178e-07, |
|
"loss": 2.0593, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.9821367618614176, |
|
"grad_norm": 0.43314129114151, |
|
"learning_rate": 1.664000683692324e-07, |
|
"loss": 1.9414, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 0.9830469905563773, |
|
"grad_norm": 0.47113651037216187, |
|
"learning_rate": 1.5018012386162072e-07, |
|
"loss": 2.3204, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.983957219251337, |
|
"grad_norm": 0.43835383653640747, |
|
"learning_rate": 1.3479116011769767e-07, |
|
"loss": 2.0271, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.9848674479462965, |
|
"grad_norm": 0.42517396807670593, |
|
"learning_rate": 1.2023330520911646e-07, |
|
"loss": 1.8366, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.9857776766412562, |
|
"grad_norm": 0.4593140482902527, |
|
"learning_rate": 1.0650668029079658e-07, |
|
"loss": 2.0962, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.9866879053362158, |
|
"grad_norm": 0.4892318546772003, |
|
"learning_rate": 9.361139959993549e-08, |
|
"loss": 2.144, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.9875981340311754, |
|
"grad_norm": 0.48190560936927795, |
|
"learning_rate": 8.154757045497619e-08, |
|
"loss": 2.1081, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.988508362726135, |
|
"grad_norm": 0.4542143940925598, |
|
"learning_rate": 7.0315293254819e-08, |
|
"loss": 1.9731, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.9894185914210946, |
|
"grad_norm": 0.4879550337791443, |
|
"learning_rate": 5.991466147791113e-08, |
|
"loss": 2.0791, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 0.9903288201160542, |
|
"grad_norm": 0.5068708062171936, |
|
"learning_rate": 5.0345761681491746e-08, |
|
"loss": 2.181, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.9912390488110138, |
|
"grad_norm": 0.5021325945854187, |
|
"learning_rate": 4.1608673500859175e-08, |
|
"loss": 2.1546, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.9921492775059734, |
|
"grad_norm": 0.5045585036277771, |
|
"learning_rate": 3.370346964876036e-08, |
|
"loss": 2.0335, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.993059506200933, |
|
"grad_norm": 0.5066413879394531, |
|
"learning_rate": 2.6630215914702495e-08, |
|
"loss": 2.0639, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 0.9939697348958926, |
|
"grad_norm": 0.5761440992355347, |
|
"learning_rate": 2.038897116447558e-08, |
|
"loss": 2.3702, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.9948799635908522, |
|
"grad_norm": 0.5774243474006653, |
|
"learning_rate": 1.4979787339619578e-08, |
|
"loss": 2.1865, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 0.9957901922858118, |
|
"grad_norm": 0.6156805157661438, |
|
"learning_rate": 1.0402709457035808e-08, |
|
"loss": 2.3407, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.9967004209807714, |
|
"grad_norm": 0.6527782082557678, |
|
"learning_rate": 6.657775608553962e-09, |
|
"loss": 2.3115, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.997610649675731, |
|
"grad_norm": 0.7234971523284912, |
|
"learning_rate": 3.745016960665648e-09, |
|
"loss": 2.5377, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.9985208783706906, |
|
"grad_norm": 0.7837737798690796, |
|
"learning_rate": 1.6644577542357375e-09, |
|
"loss": 2.4761, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 0.9994311070656502, |
|
"grad_norm": 1.1138982772827148, |
|
"learning_rate": 4.1611530431362453e-10, |
|
"loss": 2.4967, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 1.0006826715212198, |
|
"grad_norm": 3.5543136596679688, |
|
"learning_rate": 0.0, |
|
"loss": 4.7799, |
|
"step": 1099 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1099, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 275, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.0148357964895355e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |