|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2500797448165869,
  "eval_steps": 294,
  "global_step": 294,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0008506113769271664,
      "grad_norm": 0.7357208728790283,
      "learning_rate": 2e-05,
      "loss": 2.8145,
      "step": 1
    },
    {
      "epoch": 0.0017012227538543328,
      "grad_norm": 0.771299421787262,
      "learning_rate": 4e-05,
      "loss": 3.195,
      "step": 2
    },
    {
      "epoch": 0.002551834130781499,
      "grad_norm": 0.7344720363616943,
      "learning_rate": 6e-05,
      "loss": 2.8861,
      "step": 3
    },
    {
      "epoch": 0.0034024455077086655,
      "grad_norm": 0.7500324845314026,
      "learning_rate": 8e-05,
      "loss": 2.7421,
      "step": 4
    },
    {
      "epoch": 0.004253056884635832,
      "grad_norm": 0.9078495502471924,
      "learning_rate": 0.0001,
      "loss": 2.9622,
      "step": 5
    },
    {
      "epoch": 0.005103668261562998,
      "grad_norm": 1.0794708728790283,
      "learning_rate": 0.00012,
      "loss": 3.1124,
      "step": 6
    },
    {
      "epoch": 0.005954279638490165,
      "grad_norm": 1.0218361616134644,
      "learning_rate": 0.00014,
      "loss": 2.7233,
      "step": 7
    },
    {
      "epoch": 0.006804891015417331,
      "grad_norm": 1.059141755104065,
      "learning_rate": 0.00016,
      "loss": 2.8784,
      "step": 8
    },
    {
      "epoch": 0.007655502392344498,
      "grad_norm": 0.4901650547981262,
      "learning_rate": 0.00018,
      "loss": 2.6192,
      "step": 9
    },
    {
      "epoch": 0.008506113769271665,
      "grad_norm": 0.8344933390617371,
      "learning_rate": 0.0002,
      "loss": 2.6448,
      "step": 10
    },
    {
      "epoch": 0.00935672514619883,
      "grad_norm": 1.5278894901275635,
      "learning_rate": 0.00019999963702861705,
      "loss": 2.7457,
      "step": 11
    },
    {
      "epoch": 0.010207336523125997,
      "grad_norm": 1.2650033235549927,
      "learning_rate": 0.00019999854811710317,
      "loss": 2.7532,
      "step": 12
    },
    {
      "epoch": 0.011057947900053162,
      "grad_norm": 0.740222156047821,
      "learning_rate": 0.0001999967332733632,
      "loss": 2.6836,
      "step": 13
    },
    {
      "epoch": 0.01190855927698033,
      "grad_norm": 0.49257639050483704,
      "learning_rate": 0.0001999941925105719,
      "loss": 2.6658,
      "step": 14
    },
    {
      "epoch": 0.012759170653907496,
      "grad_norm": 0.3310573399066925,
      "learning_rate": 0.00019999092584717374,
      "loss": 2.5043,
      "step": 15
    },
    {
      "epoch": 0.013609782030834662,
      "grad_norm": 0.33361560106277466,
      "learning_rate": 0.00019998693330688282,
      "loss": 2.6252,
      "step": 16
    },
    {
      "epoch": 0.014460393407761828,
      "grad_norm": 0.4449865221977234,
      "learning_rate": 0.00019998221491868273,
      "loss": 2.648,
      "step": 17
    },
    {
      "epoch": 0.015311004784688996,
      "grad_norm": 0.4820970892906189,
      "learning_rate": 0.0001999767707168262,
      "loss": 2.7337,
      "step": 18
    },
    {
      "epoch": 0.01616161616161616,
      "grad_norm": 0.5144203901290894,
      "learning_rate": 0.0001999706007408351,
      "loss": 2.6967,
      "step": 19
    },
    {
      "epoch": 0.01701222753854333,
      "grad_norm": 0.501557469367981,
      "learning_rate": 0.0001999637050354999,
      "loss": 2.7318,
      "step": 20
    },
    {
      "epoch": 0.017862838915470493,
      "grad_norm": 0.4480394423007965,
      "learning_rate": 0.00019995608365087946,
      "loss": 2.4126,
      "step": 21
    },
    {
      "epoch": 0.01871345029239766,
      "grad_norm": 0.4459284842014313,
      "learning_rate": 0.00019994773664230064,
      "loss": 2.7072,
      "step": 22
    },
    {
      "epoch": 0.01956406166932483,
      "grad_norm": 0.39909827709198,
      "learning_rate": 0.00019993866407035798,
      "loss": 2.6358,
      "step": 23
    },
    {
      "epoch": 0.020414673046251993,
      "grad_norm": 0.36802783608436584,
      "learning_rate": 0.0001999288660009132,
      "loss": 2.6751,
      "step": 24
    },
    {
      "epoch": 0.02126528442317916,
      "grad_norm": 0.43287962675094604,
      "learning_rate": 0.0001999183425050946,
      "loss": 2.7518,
      "step": 25
    },
    {
      "epoch": 0.022115895800106325,
      "grad_norm": 0.4289425313472748,
      "learning_rate": 0.00019990709365929677,
      "loss": 2.7535,
      "step": 26
    },
    {
      "epoch": 0.022966507177033493,
      "grad_norm": 0.4627043604850769,
      "learning_rate": 0.00019989511954517992,
      "loss": 2.8111,
      "step": 27
    },
    {
      "epoch": 0.02381711855396066,
      "grad_norm": 0.4823961853981018,
      "learning_rate": 0.00019988242024966923,
      "loss": 2.9493,
      "step": 28
    },
    {
      "epoch": 0.024667729930887825,
      "grad_norm": 0.4622437059879303,
      "learning_rate": 0.00019986899586495432,
      "loss": 2.788,
      "step": 29
    },
    {
      "epoch": 0.025518341307814992,
      "grad_norm": 0.4963669776916504,
      "learning_rate": 0.00019985484648848853,
      "loss": 2.8304,
      "step": 30
    },
    {
      "epoch": 0.02636895268474216,
      "grad_norm": 0.47957557439804077,
      "learning_rate": 0.00019983997222298828,
      "loss": 2.7323,
      "step": 31
    },
    {
      "epoch": 0.027219564061669324,
      "grad_norm": 0.445528507232666,
      "learning_rate": 0.00019982437317643217,
      "loss": 3.015,
      "step": 32
    },
    {
      "epoch": 0.028070175438596492,
      "grad_norm": 0.46085312962532043,
      "learning_rate": 0.00019980804946206036,
      "loss": 2.8556,
      "step": 33
    },
    {
      "epoch": 0.028920786815523656,
      "grad_norm": 0.5078282356262207,
      "learning_rate": 0.0001997910011983737,
      "loss": 2.8472,
      "step": 34
    },
    {
      "epoch": 0.029771398192450824,
      "grad_norm": 0.4612430930137634,
      "learning_rate": 0.00019977322850913283,
      "loss": 2.6399,
      "step": 35
    },
    {
      "epoch": 0.03062200956937799,
      "grad_norm": 0.499965101480484,
      "learning_rate": 0.00019975473152335726,
      "loss": 2.9121,
      "step": 36
    },
    {
      "epoch": 0.03147262094630516,
      "grad_norm": 0.5101069808006287,
      "learning_rate": 0.0001997355103753246,
      "loss": 2.8488,
      "step": 37
    },
    {
      "epoch": 0.03232323232323232,
      "grad_norm": 0.5065872669219971,
      "learning_rate": 0.00019971556520456929,
      "loss": 2.8311,
      "step": 38
    },
    {
      "epoch": 0.03317384370015949,
      "grad_norm": 0.5324426889419556,
      "learning_rate": 0.00019969489615588189,
      "loss": 2.7454,
      "step": 39
    },
    {
      "epoch": 0.03402445507708666,
      "grad_norm": 0.5128815770149231,
      "learning_rate": 0.0001996735033793079,
      "loss": 2.8116,
      "step": 40
    },
    {
      "epoch": 0.03487506645401382,
      "grad_norm": 0.5330538153648376,
      "learning_rate": 0.00019965138703014655,
      "loss": 2.7584,
      "step": 41
    },
    {
      "epoch": 0.03572567783094099,
      "grad_norm": 0.556816577911377,
      "learning_rate": 0.00019962854726894997,
      "loss": 2.8902,
      "step": 42
    },
    {
      "epoch": 0.03657628920786816,
      "grad_norm": 0.5452866554260254,
      "learning_rate": 0.0001996049842615217,
      "loss": 2.7984,
      "step": 43
    },
    {
      "epoch": 0.03742690058479532,
      "grad_norm": 0.5836021304130554,
      "learning_rate": 0.0001995806981789157,
      "loss": 2.803,
      "step": 44
    },
    {
      "epoch": 0.03827751196172249,
      "grad_norm": 0.5968561172485352,
      "learning_rate": 0.00019955568919743507,
      "loss": 2.8592,
      "step": 45
    },
    {
      "epoch": 0.03912812333864966,
      "grad_norm": 0.6416970491409302,
      "learning_rate": 0.0001995299574986306,
      "loss": 2.7488,
      "step": 46
    },
    {
      "epoch": 0.03997873471557682,
      "grad_norm": 0.704325795173645,
      "learning_rate": 0.0001995035032692998,
      "loss": 2.6983,
      "step": 47
    },
    {
      "epoch": 0.040829346092503986,
      "grad_norm": 0.7766572833061218,
      "learning_rate": 0.00019947632670148517,
      "loss": 2.9677,
      "step": 48
    },
    {
      "epoch": 0.04167995746943115,
      "grad_norm": 0.7186003923416138,
      "learning_rate": 0.00019944842799247308,
      "loss": 3.0728,
      "step": 49
    },
    {
      "epoch": 0.04253056884635832,
      "grad_norm": 0.7572959065437317,
      "learning_rate": 0.00019941980734479214,
      "loss": 3.0345,
      "step": 50
    },
    {
      "epoch": 0.043381180223285486,
      "grad_norm": 0.48461732268333435,
      "learning_rate": 0.00019939046496621194,
      "loss": 2.6307,
      "step": 51
    },
    {
      "epoch": 0.04423179160021265,
      "grad_norm": 0.468675434589386,
      "learning_rate": 0.0001993604010697413,
      "loss": 2.4616,
      "step": 52
    },
    {
      "epoch": 0.04508240297713982,
      "grad_norm": 0.3815957009792328,
      "learning_rate": 0.0001993296158736269,
      "loss": 2.7479,
      "step": 53
    },
    {
      "epoch": 0.045933014354066985,
      "grad_norm": 0.3313361704349518,
      "learning_rate": 0.00019929810960135172,
      "loss": 2.4983,
      "step": 54
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 0.32521429657936096,
      "learning_rate": 0.00019926588248163316,
      "loss": 2.5446,
      "step": 55
    },
    {
      "epoch": 0.04763423710792132,
      "grad_norm": 0.2972453236579895,
      "learning_rate": 0.00019923293474842174,
      "loss": 2.5472,
      "step": 56
    },
    {
      "epoch": 0.048484848484848485,
      "grad_norm": 0.2972238063812256,
      "learning_rate": 0.00019919926664089909,
      "loss": 2.5389,
      "step": 57
    },
    {
      "epoch": 0.04933545986177565,
      "grad_norm": 0.27498453855514526,
      "learning_rate": 0.00019916487840347644,
      "loss": 2.571,
      "step": 58
    },
    {
      "epoch": 0.05018607123870282,
      "grad_norm": 0.2938655614852905,
      "learning_rate": 0.00019912977028579268,
      "loss": 2.7134,
      "step": 59
    },
    {
      "epoch": 0.051036682615629984,
      "grad_norm": 0.26742392778396606,
      "learning_rate": 0.0001990939425427127,
      "loss": 2.5632,
      "step": 60
    },
    {
      "epoch": 0.05188729399255715,
      "grad_norm": 0.28117692470550537,
      "learning_rate": 0.00019905739543432536,
      "loss": 2.5297,
      "step": 61
    },
    {
      "epoch": 0.05273790536948432,
      "grad_norm": 0.28916725516319275,
      "learning_rate": 0.00019902012922594177,
      "loss": 2.7096,
      "step": 62
    },
    {
      "epoch": 0.053588516746411484,
      "grad_norm": 0.32468459010124207,
      "learning_rate": 0.0001989821441880933,
      "loss": 2.6192,
      "step": 63
    },
    {
      "epoch": 0.05443912812333865,
      "grad_norm": 0.2806537449359894,
      "learning_rate": 0.0001989434405965295,
      "loss": 2.6747,
      "step": 64
    },
    {
      "epoch": 0.05528973950026582,
      "grad_norm": 0.2876998782157898,
      "learning_rate": 0.0001989040187322164,
      "loss": 2.7443,
      "step": 65
    },
    {
      "epoch": 0.056140350877192984,
      "grad_norm": 0.27619123458862305,
      "learning_rate": 0.00019886387888133413,
      "loss": 2.7379,
      "step": 66
    },
    {
      "epoch": 0.05699096225412015,
      "grad_norm": 0.31479549407958984,
      "learning_rate": 0.000198823021335275,
      "loss": 2.4039,
      "step": 67
    },
    {
      "epoch": 0.05784157363104731,
      "grad_norm": 0.300857812166214,
      "learning_rate": 0.00019878144639064144,
      "loss": 2.5705,
      "step": 68
    },
    {
      "epoch": 0.05869218500797448,
      "grad_norm": 0.3776433765888214,
      "learning_rate": 0.00019873915434924375,
      "loss": 2.863,
      "step": 69
    },
    {
      "epoch": 0.05954279638490165,
      "grad_norm": 0.30585938692092896,
      "learning_rate": 0.00019869614551809795,
      "loss": 2.5312,
      "step": 70
    },
    {
      "epoch": 0.06039340776182881,
      "grad_norm": 0.3163856267929077,
      "learning_rate": 0.00019865242020942353,
      "loss": 2.8491,
      "step": 71
    },
    {
      "epoch": 0.06124401913875598,
      "grad_norm": 0.30077147483825684,
      "learning_rate": 0.00019860797874064122,
      "loss": 2.7777,
      "step": 72
    },
    {
      "epoch": 0.06209463051568315,
      "grad_norm": 0.4153176248073578,
      "learning_rate": 0.0001985628214343706,
      "loss": 2.7499,
      "step": 73
    },
    {
      "epoch": 0.06294524189261032,
      "grad_norm": 0.35611122846603394,
      "learning_rate": 0.00019851694861842793,
      "loss": 2.7089,
      "step": 74
    },
    {
      "epoch": 0.06379585326953748,
      "grad_norm": 0.3143812417984009,
      "learning_rate": 0.00019847036062582357,
      "loss": 2.758,
      "step": 75
    },
    {
      "epoch": 0.06464646464646465,
      "grad_norm": 0.32024794816970825,
      "learning_rate": 0.00019842305779475968,
      "loss": 2.4616,
      "step": 76
    },
    {
      "epoch": 0.06549707602339182,
      "grad_norm": 0.3146126866340637,
      "learning_rate": 0.00019837504046862775,
      "loss": 2.6104,
      "step": 77
    },
    {
      "epoch": 0.06634768740031897,
      "grad_norm": 0.32578444480895996,
      "learning_rate": 0.00019832630899600608,
      "loss": 2.6297,
      "step": 78
    },
    {
      "epoch": 0.06719829877724615,
      "grad_norm": 0.36873045563697815,
      "learning_rate": 0.00019827686373065728,
      "loss": 2.6358,
      "step": 79
    },
    {
      "epoch": 0.06804891015417332,
      "grad_norm": 0.3558378517627716,
      "learning_rate": 0.00019822670503152567,
      "loss": 2.6308,
      "step": 80
    },
    {
      "epoch": 0.06889952153110047,
      "grad_norm": 0.37967684864997864,
      "learning_rate": 0.00019817583326273467,
      "loss": 2.7577,
      "step": 81
    },
    {
      "epoch": 0.06975013290802765,
      "grad_norm": 0.3737669885158539,
      "learning_rate": 0.00019812424879358425,
      "loss": 2.9207,
      "step": 82
    },
    {
      "epoch": 0.07060074428495482,
      "grad_norm": 0.39410829544067383,
      "learning_rate": 0.0001980719519985481,
      "loss": 2.9544,
      "step": 83
    },
    {
      "epoch": 0.07145135566188197,
      "grad_norm": 0.3863750696182251,
      "learning_rate": 0.00019801894325727104,
      "loss": 2.7794,
      "step": 84
    },
    {
      "epoch": 0.07230196703880915,
      "grad_norm": 0.4226458966732025,
      "learning_rate": 0.0001979652229545662,
      "loss": 2.7491,
      "step": 85
    },
    {
      "epoch": 0.07315257841573632,
      "grad_norm": 0.42758506536483765,
      "learning_rate": 0.0001979107914804122,
      "loss": 2.8524,
      "step": 86
    },
    {
      "epoch": 0.07400318979266347,
      "grad_norm": 0.4379200041294098,
      "learning_rate": 0.0001978556492299504,
      "loss": 2.6526,
      "step": 87
    },
    {
      "epoch": 0.07485380116959064,
      "grad_norm": 0.44331902265548706,
      "learning_rate": 0.000197799796603482,
      "loss": 2.8028,
      "step": 88
    },
    {
      "epoch": 0.07570441254651782,
      "grad_norm": 0.4358711540699005,
      "learning_rate": 0.0001977432340064651,
      "loss": 2.5426,
      "step": 89
    },
    {
      "epoch": 0.07655502392344497,
      "grad_norm": 0.45511335134506226,
      "learning_rate": 0.00019768596184951173,
      "loss": 2.7067,
      "step": 90
    },
    {
      "epoch": 0.07740563530037214,
      "grad_norm": 0.5394377112388611,
      "learning_rate": 0.00019762798054838502,
      "loss": 2.8189,
      "step": 91
    },
    {
      "epoch": 0.07825624667729932,
      "grad_norm": 0.5124706625938416,
      "learning_rate": 0.00019756929052399603,
      "loss": 2.7702,
      "step": 92
    },
    {
      "epoch": 0.07910685805422647,
      "grad_norm": 0.5025349855422974,
      "learning_rate": 0.00019750989220240073,
      "loss": 2.6872,
      "step": 93
    },
    {
      "epoch": 0.07995746943115364,
      "grad_norm": 0.5144663453102112,
      "learning_rate": 0.00019744978601479694,
      "loss": 2.6366,
      "step": 94
    },
    {
      "epoch": 0.08080808080808081,
      "grad_norm": 0.5908443927764893,
      "learning_rate": 0.00019738897239752118,
      "loss": 2.7918,
      "step": 95
    },
    {
      "epoch": 0.08165869218500797,
      "grad_norm": 0.6398508548736572,
      "learning_rate": 0.00019732745179204552,
      "loss": 2.9972,
      "step": 96
    },
    {
      "epoch": 0.08250930356193514,
      "grad_norm": 0.6032273173332214,
      "learning_rate": 0.00019726522464497435,
      "loss": 2.7638,
      "step": 97
    },
    {
      "epoch": 0.0833599149388623,
      "grad_norm": 0.6310097575187683,
      "learning_rate": 0.0001972022914080411,
      "loss": 2.9328,
      "step": 98
    },
    {
      "epoch": 0.08421052631578947,
      "grad_norm": 0.7050711512565613,
      "learning_rate": 0.00019713865253810506,
      "loss": 2.8143,
      "step": 99
    },
    {
      "epoch": 0.08506113769271664,
      "grad_norm": 0.755136251449585,
      "learning_rate": 0.00019707430849714807,
      "loss": 3.036,
      "step": 100
    },
    {
      "epoch": 0.0859117490696438,
      "grad_norm": 0.35153907537460327,
      "learning_rate": 0.00019700925975227096,
      "loss": 2.4444,
      "step": 101
    },
    {
      "epoch": 0.08676236044657097,
      "grad_norm": 0.40153488516807556,
      "learning_rate": 0.0001969435067756904,
      "loss": 2.6068,
      "step": 102
    },
    {
      "epoch": 0.08761297182349814,
      "grad_norm": 0.3474213480949402,
      "learning_rate": 0.00019687705004473545,
      "loss": 2.4261,
      "step": 103
    },
    {
      "epoch": 0.0884635832004253,
      "grad_norm": 0.3283519744873047,
      "learning_rate": 0.00019680989004184382,
      "loss": 2.6736,
      "step": 104
    },
    {
      "epoch": 0.08931419457735247,
      "grad_norm": 0.29034170508384705,
      "learning_rate": 0.00019674202725455877,
      "loss": 2.5551,
      "step": 105
    },
    {
      "epoch": 0.09016480595427964,
      "grad_norm": 0.2918970584869385,
      "learning_rate": 0.00019667346217552527,
      "loss": 2.6039,
      "step": 106
    },
    {
      "epoch": 0.0910154173312068,
      "grad_norm": 0.2852106988430023,
      "learning_rate": 0.00019660419530248655,
      "loss": 2.5432,
      "step": 107
    },
    {
      "epoch": 0.09186602870813397,
      "grad_norm": 0.30997323989868164,
      "learning_rate": 0.0001965342271382805,
      "loss": 2.7324,
      "step": 108
    },
    {
      "epoch": 0.09271664008506114,
      "grad_norm": 0.34156399965286255,
      "learning_rate": 0.00019646355819083589,
      "loss": 2.6548,
      "step": 109
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 0.2763843238353729,
      "learning_rate": 0.00019639218897316883,
      "loss": 2.5254,
      "step": 110
    },
    {
      "epoch": 0.09441786283891547,
      "grad_norm": 0.2835611402988434,
      "learning_rate": 0.00019632012000337908,
      "loss": 2.5677,
      "step": 111
    },
    {
      "epoch": 0.09526847421584264,
      "grad_norm": 0.2940271198749542,
      "learning_rate": 0.00019624735180464602,
      "loss": 2.5976,
      "step": 112
    },
    {
      "epoch": 0.0961190855927698,
      "grad_norm": 0.2714485824108124,
      "learning_rate": 0.00019617388490522517,
      "loss": 2.6087,
      "step": 113
    },
    {
      "epoch": 0.09696969696969697,
      "grad_norm": 0.30371204018592834,
      "learning_rate": 0.00019609971983844412,
      "loss": 2.6129,
      "step": 114
    },
    {
      "epoch": 0.09782030834662414,
      "grad_norm": 0.2762625813484192,
      "learning_rate": 0.0001960248571426989,
      "loss": 2.5759,
      "step": 115
    },
    {
      "epoch": 0.0986709197235513,
      "grad_norm": 0.2702981233596802,
      "learning_rate": 0.00019594929736144976,
      "loss": 2.5443,
      "step": 116
    },
    {
      "epoch": 0.09952153110047847,
      "grad_norm": 0.29210978746414185,
      "learning_rate": 0.00019587304104321746,
      "loss": 2.6425,
      "step": 117
    },
    {
      "epoch": 0.10037214247740564,
      "grad_norm": 0.31620749831199646,
      "learning_rate": 0.00019579608874157928,
      "loss": 2.703,
      "step": 118
    },
    {
      "epoch": 0.1012227538543328,
      "grad_norm": 0.2803102433681488,
      "learning_rate": 0.00019571844101516484,
      "loss": 2.6886,
      "step": 119
    },
    {
      "epoch": 0.10207336523125997,
      "grad_norm": 0.30169349908828735,
      "learning_rate": 0.00019564009842765225,
      "loss": 2.8221,
      "step": 120
    },
    {
      "epoch": 0.10292397660818714,
      "grad_norm": 0.297553151845932,
      "learning_rate": 0.00019556106154776379,
      "loss": 2.6897,
      "step": 121
    },
    {
      "epoch": 0.1037745879851143,
      "grad_norm": 0.30721086263656616,
      "learning_rate": 0.000195481330949262,
      "loss": 2.6551,
      "step": 122
    },
    {
      "epoch": 0.10462519936204147,
      "grad_norm": 0.29124605655670166,
      "learning_rate": 0.00019540090721094542,
      "loss": 2.6292,
      "step": 123
    },
    {
      "epoch": 0.10547581073896864,
      "grad_norm": 0.31037285923957825,
      "learning_rate": 0.0001953197909166443,
      "loss": 2.5459,
      "step": 124
    },
    {
      "epoch": 0.1063264221158958,
      "grad_norm": 0.3543750047683716,
      "learning_rate": 0.00019523798265521654,
      "loss": 2.5622,
      "step": 125
    },
    {
      "epoch": 0.10717703349282297,
      "grad_norm": 0.3356544077396393,
      "learning_rate": 0.00019515548302054335,
      "loss": 2.7272,
      "step": 126
    },
    {
      "epoch": 0.10802764486975014,
      "grad_norm": 0.34296396374702454,
      "learning_rate": 0.00019507229261152476,
      "loss": 2.6629,
      "step": 127
    },
    {
      "epoch": 0.1088782562466773,
      "grad_norm": 0.34629112482070923,
      "learning_rate": 0.0001949884120320756,
      "loss": 2.6371,
      "step": 128
    },
    {
      "epoch": 0.10972886762360447,
      "grad_norm": 0.34170377254486084,
      "learning_rate": 0.00019490384189112082,
      "loss": 2.7218,
      "step": 129
    },
    {
      "epoch": 0.11057947900053164,
      "grad_norm": 0.38438230752944946,
      "learning_rate": 0.0001948185828025913,
      "loss": 2.7096,
      "step": 130
    },
    {
      "epoch": 0.1114300903774588,
      "grad_norm": 0.40347060561180115,
      "learning_rate": 0.00019473263538541914,
      "loss": 2.8129,
      "step": 131
    },
    {
      "epoch": 0.11228070175438597,
      "grad_norm": 0.3742891848087311,
      "learning_rate": 0.00019464600026353348,
      "loss": 2.7916,
      "step": 132
    },
    {
      "epoch": 0.11313131313131314,
      "grad_norm": 0.4015231430530548,
      "learning_rate": 0.0001945586780658557,
      "loss": 2.6099,
      "step": 133
    },
    {
      "epoch": 0.1139819245082403,
      "grad_norm": 0.40618133544921875,
      "learning_rate": 0.00019447066942629491,
      "loss": 2.6669,
      "step": 134
    },
    {
      "epoch": 0.11483253588516747,
      "grad_norm": 0.4171842932701111,
      "learning_rate": 0.00019438197498374357,
      "loss": 2.6272,
      "step": 135
    },
    {
      "epoch": 0.11568314726209462,
      "grad_norm": 0.443013995885849,
      "learning_rate": 0.0001942925953820725,
      "loss": 2.5722,
      "step": 136
    },
    {
      "epoch": 0.1165337586390218,
      "grad_norm": 0.4636158347129822,
      "learning_rate": 0.00019420253127012645,
      "loss": 2.8075,
      "step": 137
    },
    {
      "epoch": 0.11738437001594897,
      "grad_norm": 0.4271916151046753,
      "learning_rate": 0.00019411178330171937,
      "loss": 2.6875,
      "step": 138
    },
    {
      "epoch": 0.11823498139287612,
      "grad_norm": 0.47826603055000305,
      "learning_rate": 0.00019402035213562954,
      "loss": 2.7042,
      "step": 139
    },
    {
      "epoch": 0.1190855927698033,
      "grad_norm": 0.46729791164398193,
      "learning_rate": 0.0001939282384355949,
      "loss": 2.6663,
      "step": 140
    },
    {
      "epoch": 0.11993620414673047,
      "grad_norm": 0.4689824879169464,
      "learning_rate": 0.0001938354428703082,
      "loss": 2.6138,
      "step": 141
    },
    {
      "epoch": 0.12078681552365762,
      "grad_norm": 0.526096522808075,
      "learning_rate": 0.0001937419661134121,
      "loss": 2.9258,
      "step": 142
    },
    {
      "epoch": 0.1216374269005848,
      "grad_norm": 0.5075511932373047,
      "learning_rate": 0.0001936478088434944,
      "loss": 2.8021,
      "step": 143
    },
    {
      "epoch": 0.12248803827751197,
      "grad_norm": 0.5048439502716064,
      "learning_rate": 0.00019355297174408298,
      "loss": 2.6274,
      "step": 144
    },
    {
      "epoch": 0.12333864965443912,
      "grad_norm": 0.5787357687950134,
      "learning_rate": 0.00019345745550364087,
      "loss": 2.851,
      "step": 145
    },
    {
      "epoch": 0.1241892610313663,
      "grad_norm": 0.5641311407089233,
      "learning_rate": 0.00019336126081556134,
      "loss": 2.7681,
      "step": 146
    },
    {
      "epoch": 0.12503987240829345,
      "grad_norm": 0.5504147410392761,
      "learning_rate": 0.00019326438837816276,
      "loss": 2.6905,
      "step": 147
    },
    {
      "epoch": 0.12589048378522064,
      "grad_norm": 0.6101283431053162,
      "learning_rate": 0.00019316683889468358,
      "loss": 2.589,
      "step": 148
    },
    {
      "epoch": 0.1267410951621478,
      "grad_norm": 0.7153661847114563,
      "learning_rate": 0.00019306861307327725,
      "loss": 2.9563,
      "step": 149
    },
    {
      "epoch": 0.12759170653907495,
      "grad_norm": 0.7049738168716431,
      "learning_rate": 0.00019296971162700694,
      "loss": 2.8023,
      "step": 150
    },
    {
      "epoch": 0.12844231791600214,
      "grad_norm": 0.3282754421234131,
      "learning_rate": 0.00019287013527384062,
      "loss": 2.4278,
      "step": 151
    },
    {
      "epoch": 0.1292929292929293,
      "grad_norm": 0.350577712059021,
      "learning_rate": 0.00019276988473664557,
      "loss": 2.5845,
      "step": 152
    },
    {
      "epoch": 0.13014354066985645,
      "grad_norm": 0.32433176040649414,
      "learning_rate": 0.00019266896074318334,
      "loss": 2.6126,
      "step": 153
    },
    {
      "epoch": 0.13099415204678364,
      "grad_norm": 0.31844663619995117,
      "learning_rate": 0.00019256736402610436,
      "loss": 2.527,
      "step": 154
    },
    {
      "epoch": 0.1318447634237108,
      "grad_norm": 0.2559802830219269,
      "learning_rate": 0.00019246509532294266,
      "loss": 2.2437,
      "step": 155
    },
    {
      "epoch": 0.13269537480063795,
      "grad_norm": 0.28512275218963623,
      "learning_rate": 0.00019236215537611046,
      "loss": 2.5739,
      "step": 156
    },
    {
      "epoch": 0.13354598617756513,
      "grad_norm": 0.26634740829467773,
      "learning_rate": 0.00019225854493289286,
      "loss": 2.4485,
      "step": 157
    },
    {
      "epoch": 0.1343965975544923,
      "grad_norm": 0.2785400450229645,
      "learning_rate": 0.0001921542647454424,
      "loss": 2.7944,
      "step": 158
    },
    {
      "epoch": 0.13524720893141945,
      "grad_norm": 0.27485981583595276,
      "learning_rate": 0.00019204931557077355,
      "loss": 2.6518,
      "step": 159
    },
    {
      "epoch": 0.13609782030834663,
      "grad_norm": 0.2687318027019501,
      "learning_rate": 0.00019194369817075724,
      "loss": 2.6595,
      "step": 160
    },
    {
      "epoch": 0.1369484316852738,
      "grad_norm": 0.26418977975845337,
      "learning_rate": 0.00019183741331211537,
      "loss": 2.7045,
      "step": 161
    },
    {
      "epoch": 0.13779904306220095,
      "grad_norm": 0.28258347511291504,
      "learning_rate": 0.00019173046176641513,
      "loss": 2.5896,
      "step": 162
    },
    {
      "epoch": 0.13864965443912813,
      "grad_norm": 0.27390146255493164,
      "learning_rate": 0.00019162284431006358,
      "loss": 2.5566,
      "step": 163
    },
    {
      "epoch": 0.1395002658160553,
      "grad_norm": 0.2916048765182495,
      "learning_rate": 0.00019151456172430183,
      "loss": 2.609,
      "step": 164
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 0.30684247612953186,
      "learning_rate": 0.00019140561479519955,
      "loss": 2.5222,
      "step": 165
    },
    {
      "epoch": 0.14120148856990963,
      "grad_norm": 0.26836761832237244,
      "learning_rate": 0.00019129600431364897,
      "loss": 2.5891,
      "step": 166
    },
    {
      "epoch": 0.1420520999468368,
      "grad_norm": 0.2658300995826721,
      "learning_rate": 0.00019118573107535953,
      "loss": 2.644,
      "step": 167
    },
    {
      "epoch": 0.14290271132376395,
      "grad_norm": 0.2789425551891327,
      "learning_rate": 0.00019107479588085182,
      "loss": 2.5641,
      "step": 168
    },
    {
      "epoch": 0.14375332270069113,
      "grad_norm": 0.2909972071647644,
      "learning_rate": 0.00019096319953545185,
      "loss": 2.5982,
      "step": 169
    },
    {
      "epoch": 0.1446039340776183,
      "grad_norm": 0.3741363286972046,
      "learning_rate": 0.0001908509428492852,
      "loss": 2.6293,
      "step": 170
    },
    {
      "epoch": 0.14545454545454545,
      "grad_norm": 0.2989426851272583,
      "learning_rate": 0.0001907380266372712,
      "loss": 2.7364,
      "step": 171
    },
    {
      "epoch": 0.14630515683147263,
      "grad_norm": 0.28862622380256653,
      "learning_rate": 0.00019062445171911686,
      "loss": 2.5656,
      "step": 172
    },
    {
      "epoch": 0.1471557682083998,
      "grad_norm": 0.3215920329093933,
      "learning_rate": 0.0001905102189193112,
      "loss": 2.8443,
      "step": 173
    },
    {
      "epoch": 0.14800637958532695,
      "grad_norm": 0.2994636595249176,
      "learning_rate": 0.00019039532906711882,
      "loss": 2.7014,
      "step": 174
    },
    {
      "epoch": 0.14885699096225413,
      "grad_norm": 0.32109183073043823,
      "learning_rate": 0.00019027978299657436,
      "loss": 2.8364,
      "step": 175
    },
    {
      "epoch": 0.1497076023391813,
      "grad_norm": 0.30813783407211304,
      "learning_rate": 0.00019016358154647618,
      "loss": 2.5102,
      "step": 176
    },
    {
      "epoch": 0.15055821371610845,
      "grad_norm": 0.32674533128738403,
      "learning_rate": 0.00019004672556038028,
      "loss": 2.757,
      "step": 177
    },
    {
      "epoch": 0.15140882509303563,
      "grad_norm": 0.34680357575416565,
      "learning_rate": 0.00018992921588659422,
      "loss": 2.5228,
      "step": 178
    },
    {
      "epoch": 0.1522594364699628,
      "grad_norm": 0.35170817375183105,
      "learning_rate": 0.00018981105337817104,
      "loss": 2.6148,
      "step": 179
    },
    {
      "epoch": 0.15311004784688995,
      "grad_norm": 0.3741483986377716,
      "learning_rate": 0.00018969223889290284,
      "loss": 2.8025,
      "step": 180
    },
    {
      "epoch": 0.15396065922381713,
      "grad_norm": 0.4156269431114197,
      "learning_rate": 0.00018957277329331485,
      "loss": 2.72,
      "step": 181
    },
    {
      "epoch": 0.1548112706007443,
      "grad_norm": 0.3726477324962616,
      "learning_rate": 0.00018945265744665886,
      "loss": 2.6197,
      "step": 182
    },
    {
      "epoch": 0.15566188197767145,
      "grad_norm": 0.4135706424713135,
      "learning_rate": 0.00018933189222490726,
      "loss": 2.7176,
      "step": 183
    },
    {
      "epoch": 0.15651249335459863,
      "grad_norm": 0.38799911737442017,
      "learning_rate": 0.00018921047850474642,
      "loss": 2.5641,
      "step": 184
    },
    {
      "epoch": 0.1573631047315258,
      "grad_norm": 0.4622843265533447,
      "learning_rate": 0.00018908841716757042,
      "loss": 2.7626,
      "step": 185
    },
    {
      "epoch": 0.15821371610845295,
      "grad_norm": 0.4251146912574768,
      "learning_rate": 0.00018896570909947475,
      "loss": 2.6842,
      "step": 186
    },
    {
      "epoch": 0.15906432748538013,
      "grad_norm": 0.4628697335720062,
      "learning_rate": 0.00018884235519124972,
      "loss": 2.9476,
      "step": 187
    },
    {
      "epoch": 0.1599149388623073,
      "grad_norm": 0.5052159428596497,
      "learning_rate": 0.0001887183563383741,
      "loss": 2.769,
      "step": 188
    },
    {
      "epoch": 0.16076555023923444,
      "grad_norm": 0.4817435145378113,
      "learning_rate": 0.00018859371344100864,
      "loss": 2.6266,
      "step": 189
    },
    {
      "epoch": 0.16161616161616163,
      "grad_norm": 0.4751468598842621,
      "learning_rate": 0.0001884684274039894,
      "loss": 2.877,
      "step": 190
    },
    {
      "epoch": 0.1624667729930888,
      "grad_norm": 0.5826165676116943,
      "learning_rate": 0.00018834249913682132,
      "loss": 2.7308,
      "step": 191
    },
    {
      "epoch": 0.16331738437001594,
      "grad_norm": 0.5441760420799255,
      "learning_rate": 0.00018821592955367154,
      "loss": 2.6764,
      "step": 192
    },
    {
      "epoch": 0.1641679957469431,
      "grad_norm": 0.5005947947502136,
      "learning_rate": 0.00018808871957336275,
      "loss": 2.664,
      "step": 193
    },
    {
      "epoch": 0.1650186071238703,
      "grad_norm": 0.5205551981925964,
      "learning_rate": 0.00018796087011936665,
      "loss": 2.6192,
      "step": 194
    },
    {
      "epoch": 0.16586921850079744,
      "grad_norm": 0.5489931106567383,
      "learning_rate": 0.0001878323821197971,
      "loss": 2.5061,
      "step": 195
    },
    {
      "epoch": 0.1667198298777246,
      "grad_norm": 0.5525840520858765,
      "learning_rate": 0.00018770325650740345,
      "loss": 2.7474,
      "step": 196
    },
    {
      "epoch": 0.1675704412546518,
      "grad_norm": 0.5978725552558899,
      "learning_rate": 0.0001875734942195637,
      "loss": 2.6055,
      "step": 197
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 0.6148700714111328,
      "learning_rate": 0.0001874430961982778,
      "loss": 2.8352,
      "step": 198
    },
    {
      "epoch": 0.1692716640085061,
      "grad_norm": 0.5956620573997498,
      "learning_rate": 0.0001873120633901608,
      "loss": 2.7367,
      "step": 199
    },
    {
      "epoch": 0.17012227538543329,
      "grad_norm": 0.7082740664482117,
      "learning_rate": 0.0001871803967464358,
      "loss": 2.9437,
      "step": 200
    },
    {
      "epoch": 0.17097288676236044,
      "grad_norm": 0.32244405150413513,
      "learning_rate": 0.00018704809722292737,
      "loss": 2.3835,
      "step": 201
    },
    {
      "epoch": 0.1718234981392876,
      "grad_norm": 0.3367772102355957,
      "learning_rate": 0.00018691516578005427,
      "loss": 2.601,
      "step": 202
    },
    {
      "epoch": 0.17267410951621479,
      "grad_norm": 0.31732872128486633,
      "learning_rate": 0.00018678160338282272,
      "loss": 2.5894,
      "step": 203
    },
    {
      "epoch": 0.17352472089314194,
      "grad_norm": 0.27467650175094604,
      "learning_rate": 0.0001866474110008193,
      "loss": 2.4369,
      "step": 204
    },
    {
      "epoch": 0.1743753322700691,
      "grad_norm": 0.29726937413215637,
      "learning_rate": 0.00018651258960820385,
      "loss": 2.6123,
      "step": 205
    },
    {
      "epoch": 0.17522594364699628,
      "grad_norm": 0.27499106526374817,
      "learning_rate": 0.00018637714018370253,
      "loss": 2.5141,
      "step": 206
    },
    {
      "epoch": 0.17607655502392344,
      "grad_norm": 0.27535390853881836,
      "learning_rate": 0.00018624106371060067,
      "loss": 2.5148,
      "step": 207
    },
    {
      "epoch": 0.1769271664008506,
      "grad_norm": 0.2687024176120758,
      "learning_rate": 0.00018610436117673555,
      "loss": 2.6057,
      "step": 208
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 0.31320950388908386,
      "learning_rate": 0.00018596703357448934,
      "loss": 2.6813,
      "step": 209
    },
    {
      "epoch": 0.17862838915470494,
      "grad_norm": 0.25832033157348633,
      "learning_rate": 0.00018582908190078185,
      "loss": 2.4898,
      "step": 210
    },
    {
      "epoch": 0.1794790005316321,
      "grad_norm": 0.2806166410446167,
      "learning_rate": 0.00018569050715706325,
      "loss": 2.5762,
      "step": 211
    },
    {
      "epoch": 0.18032961190855928,
      "grad_norm": 0.26099708676338196,
      "learning_rate": 0.00018555131034930685,
      "loss": 2.5386,
      "step": 212
    },
    {
      "epoch": 0.18118022328548644,
      "grad_norm": 0.26140880584716797,
      "learning_rate": 0.00018541149248800184,
      "loss": 2.7159,
      "step": 213
    },
    {
      "epoch": 0.1820308346624136,
      "grad_norm": 0.2698177695274353,
      "learning_rate": 0.0001852710545881459,
      "loss": 2.5942,
      "step": 214
    },
    {
      "epoch": 0.18288144603934078,
      "grad_norm": 0.27240726351737976,
      "learning_rate": 0.00018512999766923772,
      "loss": 2.5377,
      "step": 215
    },
    {
      "epoch": 0.18373205741626794,
      "grad_norm": 0.2780822813510895,
      "learning_rate": 0.00018498832275526988,
      "loss": 2.6185,
      "step": 216
    },
    {
      "epoch": 0.1845826687931951,
      "grad_norm": 0.2713901400566101,
      "learning_rate": 0.00018484603087472109,
      "loss": 2.5802,
      "step": 217
    },
    {
      "epoch": 0.18543328017012228,
      "grad_norm": 0.2843954265117645,
      "learning_rate": 0.000184703123060549,
      "loss": 2.6404,
      "step": 218
    },
    {
      "epoch": 0.18628389154704944,
      "grad_norm": 0.2679051160812378,
      "learning_rate": 0.0001845596003501826,
      "loss": 2.6688,
      "step": 219
    },
    {
      "epoch": 0.1871345029239766,
      "grad_norm": 0.292568176984787,
      "learning_rate": 0.00018441546378551458,
      "loss": 2.6505,
      "step": 220
    },
    {
      "epoch": 0.18798511430090378,
      "grad_norm": 0.282326877117157,
      "learning_rate": 0.00018427071441289388,
      "loss": 2.6299,
      "step": 221
    },
    {
      "epoch": 0.18883572567783094,
      "grad_norm": 0.2853985130786896,
      "learning_rate": 0.00018412535328311814,
      "loss": 2.8143,
      "step": 222
    },
    {
      "epoch": 0.1896863370547581,
      "grad_norm": 0.2786814868450165,
      "learning_rate": 0.00018397938145142591,
      "loss": 2.6007,
      "step": 223
    },
    {
      "epoch": 0.19053694843168528,
      "grad_norm": 0.42460358142852783,
      "learning_rate": 0.0001838327999774892,
      "loss": 2.7891,
      "step": 224
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.30478838086128235,
      "learning_rate": 0.00018368560992540562,
      "loss": 2.4551,
      "step": 225
    },
    {
      "epoch": 0.1922381711855396,
      "grad_norm": 0.3402044177055359,
      "learning_rate": 0.00018353781236369064,
      "loss": 2.9191,
      "step": 226
    },
    {
      "epoch": 0.19308878256246678,
      "grad_norm": 0.33662521839141846,
      "learning_rate": 0.00018338940836527004,
      "loss": 2.5606,
      "step": 227
    },
    {
      "epoch": 0.19393939393939394,
      "grad_norm": 0.34461426734924316,
      "learning_rate": 0.0001832403990074719,
      "loss": 2.714,
      "step": 228
    },
    {
      "epoch": 0.1947900053163211,
      "grad_norm": 0.342184454202652,
      "learning_rate": 0.0001830907853720188,
      "loss": 2.6936,
      "step": 229
    },
    {
      "epoch": 0.19564061669324828,
      "grad_norm": 0.3557281494140625,
      "learning_rate": 0.0001829405685450202,
      "loss": 2.6663,
      "step": 230
    },
    {
      "epoch": 0.19649122807017544,
      "grad_norm": 0.38674700260162354,
      "learning_rate": 0.0001827897496169642,
      "loss": 2.7257,
      "step": 231
    },
    {
      "epoch": 0.1973418394471026,
      "grad_norm": 0.3849089741706848,
      "learning_rate": 0.00018263832968271,
      "loss": 2.7178,
      "step": 232
    },
    {
      "epoch": 0.19819245082402978,
      "grad_norm": 0.4508901834487915,
      "learning_rate": 0.00018248630984147955,
      "loss": 2.7947,
      "step": 233
    },
    {
      "epoch": 0.19904306220095694,
      "grad_norm": 0.39502936601638794,
      "learning_rate": 0.00018233369119684996,
      "loss": 2.5885,
      "step": 234
    },
    {
      "epoch": 0.1998936735778841,
      "grad_norm": 0.4287837743759155,
      "learning_rate": 0.00018218047485674523,
      "loss": 2.6911,
      "step": 235
    },
    {
      "epoch": 0.20074428495481128,
      "grad_norm": 0.4257849454879761,
      "learning_rate": 0.00018202666193342833,
      "loss": 2.8803,
      "step": 236
    },
    {
      "epoch": 0.20159489633173844,
      "grad_norm": 0.4459477961063385,
      "learning_rate": 0.00018187225354349295,
      "loss": 2.8352,
      "step": 237
    },
    {
      "epoch": 0.2024455077086656,
      "grad_norm": 0.4430312514305115,
      "learning_rate": 0.0001817172508078557,
      "loss": 2.7517,
      "step": 238
    },
    {
      "epoch": 0.20329611908559278,
      "grad_norm": 0.4465429484844208,
      "learning_rate": 0.00018156165485174773,
      "loss": 2.7119,
      "step": 239
    },
    {
      "epoch": 0.20414673046251994,
      "grad_norm": 0.4532601833343506,
      "learning_rate": 0.00018140546680470659,
      "loss": 2.7346,
      "step": 240
    },
    {
      "epoch": 0.2049973418394471,
      "grad_norm": 0.4750036299228668,
      "learning_rate": 0.00018124868780056814,
      "loss": 2.6113,
      "step": 241
    },
    {
      "epoch": 0.20584795321637428,
      "grad_norm": 0.5072234272956848,
      "learning_rate": 0.00018109131897745822,
      "loss": 2.844,
      "step": 242
    },
    {
      "epoch": 0.20669856459330144,
      "grad_norm": 0.5094662308692932,
      "learning_rate": 0.00018093336147778438,
      "loss": 2.7737,
      "step": 243
    },
    {
      "epoch": 0.2075491759702286,
      "grad_norm": 0.606842577457428,
      "learning_rate": 0.00018077481644822768,
      "loss": 2.6153,
      "step": 244
    },
    {
      "epoch": 0.20839978734715578,
      "grad_norm": 0.5311163067817688,
      "learning_rate": 0.00018061568503973435,
      "loss": 2.6038,
      "step": 245
    },
    {
      "epoch": 0.20925039872408294,
      "grad_norm": 0.5758761167526245,
      "learning_rate": 0.00018045596840750723,
      "loss": 2.6446,
      "step": 246
    },
    {
      "epoch": 0.2101010101010101,
      "grad_norm": 0.598297119140625,
      "learning_rate": 0.00018029566771099776,
      "loss": 2.7002,
      "step": 247
    },
    {
      "epoch": 0.21095162147793728,
      "grad_norm": 0.6635774970054626,
      "learning_rate": 0.00018013478411389716,
      "loss": 2.8011,
      "step": 248
    },
    {
      "epoch": 0.21180223285486444,
      "grad_norm": 0.6850919723510742,
      "learning_rate": 0.00017997331878412835,
      "loss": 2.8903,
      "step": 249
    },
    {
      "epoch": 0.2126528442317916,
      "grad_norm": 0.7298348546028137,
      "learning_rate": 0.00017981127289383716,
      "loss": 2.9483,
      "step": 250
    },
    {
      "epoch": 0.21350345560871878,
      "grad_norm": 0.33354559540748596,
      "learning_rate": 0.00017964864761938404,
      "loss": 2.4727,
      "step": 251
    },
    {
      "epoch": 0.21435406698564594,
      "grad_norm": 0.3557465374469757,
      "learning_rate": 0.00017948544414133534,
      "loss": 2.5058,
      "step": 252
    },
    {
      "epoch": 0.2152046783625731,
      "grad_norm": 0.3230442702770233,
      "learning_rate": 0.00017932166364445498,
      "loss": 2.5422,
      "step": 253
    },
    {
      "epoch": 0.21605528973950028,
      "grad_norm": 0.28668278455734253,
      "learning_rate": 0.0001791573073176956,
      "loss": 2.3173,
      "step": 254
    },
    {
      "epoch": 0.21690590111642744,
      "grad_norm": 0.30019721388816833,
      "learning_rate": 0.00017899237635419002,
      "loss": 2.6444,
      "step": 255
    },
    {
      "epoch": 0.2177565124933546,
      "grad_norm": 0.285314679145813,
      "learning_rate": 0.0001788268719512427,
      "loss": 2.5319,
      "step": 256
    },
    {
      "epoch": 0.21860712387028178,
      "grad_norm": 0.27584996819496155,
      "learning_rate": 0.00017866079531032088,
      "loss": 2.6496,
      "step": 257
    },
    {
      "epoch": 0.21945773524720893,
      "grad_norm": 0.2874069809913635,
      "learning_rate": 0.0001784941476370459,
      "loss": 2.5156,
      "step": 258
    },
    {
      "epoch": 0.2203083466241361,
      "grad_norm": 0.26786255836486816,
      "learning_rate": 0.00017832693014118448,
      "loss": 2.6211,
      "step": 259
    },
    {
      "epoch": 0.22115895800106328,
      "grad_norm": 0.2633914351463318,
      "learning_rate": 0.0001781591440366399,
      "loss": 2.5811,
      "step": 260
    },
    {
      "epoch": 0.22200956937799043,
      "grad_norm": 0.2724866569042206,
      "learning_rate": 0.00017799079054144334,
      "loss": 2.5904,
      "step": 261
    },
    {
      "epoch": 0.2228601807549176,
      "grad_norm": 0.29333001375198364,
      "learning_rate": 0.00017782187087774477,
      "loss": 2.7581,
      "step": 262
    },
    {
      "epoch": 0.22371079213184478,
      "grad_norm": 0.2735550105571747,
      "learning_rate": 0.00017765238627180424,
      "loss": 2.7114,
      "step": 263
    },
    {
      "epoch": 0.22456140350877193,
      "grad_norm": 0.2721397280693054,
      "learning_rate": 0.00017748233795398307,
      "loss": 2.5991,
      "step": 264
    },
    {
      "epoch": 0.2254120148856991,
      "grad_norm": 0.25755858421325684,
      "learning_rate": 0.0001773117271587346,
      "loss": 2.5786,
      "step": 265
    },
    {
      "epoch": 0.22626262626262628,
      "grad_norm": 0.25772804021835327,
      "learning_rate": 0.00017714055512459565,
      "loss": 2.488,
      "step": 266
    },
    {
      "epoch": 0.22711323763955343,
      "grad_norm": 0.2766227424144745,
      "learning_rate": 0.0001769688230941772,
      "loss": 2.8924,
      "step": 267
    },
    {
      "epoch": 0.2279638490164806,
      "grad_norm": 0.26846593618392944,
      "learning_rate": 0.00017679653231415552,
      "loss": 2.5783,
      "step": 268
    },
    {
      "epoch": 0.22881446039340775,
      "grad_norm": 0.26374372839927673,
      "learning_rate": 0.00017662368403526302,
      "loss": 2.4675,
      "step": 269
    },
    {
      "epoch": 0.22966507177033493,
      "grad_norm": 0.28237268328666687,
      "learning_rate": 0.0001764502795122793,
      "loss": 2.5994,
      "step": 270
    },
    {
      "epoch": 0.2305156831472621,
      "grad_norm": 0.2786102890968323,
      "learning_rate": 0.00017627632000402193,
      "loss": 2.514,
      "step": 271
    },
    {
      "epoch": 0.23136629452418925,
      "grad_norm": 0.27646180987358093,
      "learning_rate": 0.00017610180677333739,
      "loss": 2.5673,
      "step": 272
    },
    {
      "epoch": 0.23221690590111643,
      "grad_norm": 0.3052549660205841,
      "learning_rate": 0.00017592674108709186,
      "loss": 2.5345,
      "step": 273
    },
    {
      "epoch": 0.2330675172780436,
      "grad_norm": 0.30554690957069397,
      "learning_rate": 0.00017575112421616202,
      "loss": 2.709,
      "step": 274
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 0.3219161331653595,
      "learning_rate": 0.00017557495743542585,
      "loss": 2.6825,
      "step": 275
    },
    {
      "epoch": 0.23476874003189793,
      "grad_norm": 0.31834957003593445,
      "learning_rate": 0.0001753982420237533,
      "loss": 2.7017,
      "step": 276
    },
    {
      "epoch": 0.2356193514088251,
      "grad_norm": 0.30264872312545776,
      "learning_rate": 0.00017522097926399722,
      "loss": 2.3725,
      "step": 277
    },
    {
      "epoch": 0.23646996278575225,
      "grad_norm": 0.3283548951148987,
      "learning_rate": 0.00017504317044298367,
      "loss": 2.6217,
      "step": 278
    },
    {
      "epoch": 0.23732057416267943,
      "grad_norm": 0.33564746379852295,
      "learning_rate": 0.00017486481685150302,
      "loss": 2.5738,
      "step": 279
    },
    {
      "epoch": 0.2381711855396066,
      "grad_norm": 0.37258434295654297,
      "learning_rate": 0.0001746859197843002,
      "loss": 2.783,
      "step": 280
    },
    {
      "epoch": 0.23902179691653375,
      "grad_norm": 0.3897363245487213,
      "learning_rate": 0.0001745064805400656,
      "loss": 2.7908,
      "step": 281
    },
    {
      "epoch": 0.23987240829346093,
      "grad_norm": 0.3756699562072754,
      "learning_rate": 0.00017432650042142536,
      "loss": 2.5944,
      "step": 282
    },
    {
      "epoch": 0.2407230196703881,
      "grad_norm": 0.3787755072116852,
      "learning_rate": 0.00017414598073493216,
      "loss": 2.7574,
      "step": 283
    },
    {
      "epoch": 0.24157363104731525,
      "grad_norm": 0.38891106843948364,
      "learning_rate": 0.0001739649227910556,
      "loss": 2.8635,
      "step": 284
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 0.40293633937835693,
      "learning_rate": 0.00017378332790417273,
      "loss": 2.729,
      "step": 285
    },
    {
      "epoch": 0.2432748538011696,
      "grad_norm": 0.414109468460083,
      "learning_rate": 0.00017360119739255852,
      "loss": 2.6077,
      "step": 286
    },
    {
      "epoch": 0.24412546517809675,
      "grad_norm": 0.42549028992652893,
      "learning_rate": 0.0001734185325783762,
      "loss": 2.7812,
      "step": 287
    },
    {
      "epoch": 0.24497607655502393,
      "grad_norm": 0.42882055044174194,
      "learning_rate": 0.00017323533478766777,
      "loss": 2.7653,
      "step": 288
    },
    {
      "epoch": 0.2458266879319511,
      "grad_norm": 0.42119139432907104,
      "learning_rate": 0.00017305160535034436,
      "loss": 2.5355,
      "step": 289
    },
    {
      "epoch": 0.24667729930887825,
      "grad_norm": 0.4749990999698639,
      "learning_rate": 0.0001728673456001766,
      "loss": 2.7885,
      "step": 290
    },
    {
      "epoch": 0.24752791068580543,
      "grad_norm": 0.4682268500328064,
      "learning_rate": 0.00017268255687478469,
      "loss": 2.6402,
      "step": 291
    },
    {
      "epoch": 0.2483785220627326,
      "grad_norm": 0.4854019284248352,
      "learning_rate": 0.00017249724051562906,
      "loss": 2.7255,
      "step": 292
    },
    {
      "epoch": 0.24922913343965974,
      "grad_norm": 0.5112527012825012,
      "learning_rate": 0.00017231139786800042,
      "loss": 2.8374,
      "step": 293
    },
    {
      "epoch": 0.2500797448165869,
      "grad_norm": 0.5242344737052917,
      "learning_rate": 0.0001721250302810101,
      "loss": 2.9178,
      "step": 294
    },
    {
      "epoch": 0.2500797448165869,
      "eval_loss": 2.688343048095703,
      "eval_runtime": 80.6326,
      "eval_samples_per_second": 12.278,
      "eval_steps_per_second": 6.139,
      "step": 294
    }
  ],
  "logging_steps": 1,
  "max_steps": 1176,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 294,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.987046260755661e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
|
|