{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 1000,
"global_step": 1110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04504504504504504,
"grad_norm": 3.3412909507751465,
"learning_rate": 1.801801801801802e-05,
"loss": 1.2791,
"step": 10
},
{
"epoch": 0.09009009009009009,
"grad_norm": 1.4792481660842896,
"learning_rate": 3.603603603603604e-05,
"loss": 1.0799,
"step": 20
},
{
"epoch": 0.13513513513513514,
"grad_norm": 1.3788402080535889,
"learning_rate": 5.405405405405406e-05,
"loss": 0.7657,
"step": 30
},
{
"epoch": 0.18018018018018017,
"grad_norm": 0.7668061256408691,
"learning_rate": 7.207207207207208e-05,
"loss": 0.5807,
"step": 40
},
{
"epoch": 0.22522522522522523,
"grad_norm": 0.7166613936424255,
"learning_rate": 9.009009009009009e-05,
"loss": 0.6035,
"step": 50
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.9661350846290588,
"learning_rate": 0.00010810810810810812,
"loss": 0.5441,
"step": 60
},
{
"epoch": 0.3153153153153153,
"grad_norm": 0.7341681122779846,
"learning_rate": 0.00012612612612612612,
"loss": 0.6031,
"step": 70
},
{
"epoch": 0.36036036036036034,
"grad_norm": 1.3319752216339111,
"learning_rate": 0.00014414414414414415,
"loss": 0.5686,
"step": 80
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.7269447445869446,
"learning_rate": 0.00016216216216216218,
"loss": 0.4978,
"step": 90
},
{
"epoch": 0.45045045045045046,
"grad_norm": 0.46780362725257874,
"learning_rate": 0.00018018018018018018,
"loss": 0.473,
"step": 100
},
{
"epoch": 0.4954954954954955,
"grad_norm": 0.728823184967041,
"learning_rate": 0.0001981981981981982,
"loss": 0.5892,
"step": 110
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.5054831504821777,
"learning_rate": 0.0001999599507118322,
"loss": 0.5117,
"step": 120
},
{
"epoch": 0.5855855855855856,
"grad_norm": 0.6193355321884155,
"learning_rate": 0.00019982154991201608,
"loss": 0.4759,
"step": 130
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.3995501399040222,
"learning_rate": 0.00019958443999073397,
"loss": 0.3928,
"step": 140
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.5497131943702698,
"learning_rate": 0.0001992488554155135,
"loss": 0.4507,
"step": 150
},
{
"epoch": 0.7207207207207207,
"grad_norm": 1.0290075540542603,
"learning_rate": 0.00019881512803111796,
"loss": 0.4766,
"step": 160
},
{
"epoch": 0.7657657657657657,
"grad_norm": 0.6826834678649902,
"learning_rate": 0.00019828368673139947,
"loss": 0.5236,
"step": 170
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.6135373711585999,
"learning_rate": 0.00019765505703518496,
"loss": 0.4454,
"step": 180
},
{
"epoch": 0.8558558558558559,
"grad_norm": 0.6352598667144775,
"learning_rate": 0.00019692986056661356,
"loss": 0.508,
"step": 190
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.5680545568466187,
"learning_rate": 0.0001961088144404403,
"loss": 0.5896,
"step": 200
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.3386252820491791,
"learning_rate": 0.00019519273055291266,
"loss": 0.4729,
"step": 210
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.358553409576416,
"learning_rate": 0.0001941825147789225,
"loss": 0.45,
"step": 220
},
{
"epoch": 1.0360360360360361,
"grad_norm": 0.668021023273468,
"learning_rate": 0.0001930791660762262,
"loss": 0.4162,
"step": 230
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.5463367700576782,
"learning_rate": 0.00019188377549761963,
"loss": 0.4445,
"step": 240
},
{
"epoch": 1.1261261261261262,
"grad_norm": 0.7385880351066589,
"learning_rate": 0.000190597525112044,
"loss": 0.3849,
"step": 250
},
{
"epoch": 1.1711711711711712,
"grad_norm": 0.6837536692619324,
"learning_rate": 0.0001892216868356904,
"loss": 0.4652,
"step": 260
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.866578221321106,
"learning_rate": 0.00018775762117425777,
"loss": 0.4648,
"step": 270
},
{
"epoch": 1.2612612612612613,
"grad_norm": 0.6583455204963684,
"learning_rate": 0.00018620677587760916,
"loss": 0.3848,
"step": 280
},
{
"epoch": 1.3063063063063063,
"grad_norm": 0.6937561631202698,
"learning_rate": 0.00018457068450815562,
"loss": 0.4532,
"step": 290
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.5930050611495972,
"learning_rate": 0.00018285096492438424,
"loss": 0.5282,
"step": 300
},
{
"epoch": 1.3963963963963963,
"grad_norm": 1.1432991027832031,
"learning_rate": 0.0001810493176810292,
"loss": 0.4369,
"step": 310
},
{
"epoch": 1.4414414414414414,
"grad_norm": 0.5736434459686279,
"learning_rate": 0.00017916752434746856,
"loss": 0.4434,
"step": 320
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.5792914032936096,
"learning_rate": 0.00017720744574600863,
"loss": 0.4434,
"step": 330
},
{
"epoch": 1.5315315315315314,
"grad_norm": 0.7626290917396545,
"learning_rate": 0.00017517102011179933,
"loss": 0.4226,
"step": 340
},
{
"epoch": 1.5765765765765765,
"grad_norm": 0.6746386289596558,
"learning_rate": 0.00017306026117619889,
"loss": 0.4126,
"step": 350
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.5064342617988586,
"learning_rate": 0.00017087725617548385,
"loss": 0.3926,
"step": 360
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.793991208076477,
"learning_rate": 0.0001686241637868734,
"loss": 0.4437,
"step": 370
},
{
"epoch": 1.7117117117117115,
"grad_norm": 0.6197868585586548,
"learning_rate": 0.00016630321199390867,
"loss": 0.3932,
"step": 380
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.5004612803459167,
"learning_rate": 0.0001639166958832985,
"loss": 0.3883,
"step": 390
},
{
"epoch": 1.8018018018018018,
"grad_norm": 0.7265865206718445,
"learning_rate": 0.00016146697537540924,
"loss": 0.4453,
"step": 400
},
{
"epoch": 1.8468468468468469,
"grad_norm": 0.5155379772186279,
"learning_rate": 0.00015895647289064396,
"loss": 0.48,
"step": 410
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.5756716132164001,
"learning_rate": 0.0001563876709540178,
"loss": 0.4874,
"step": 420
},
{
"epoch": 1.936936936936937,
"grad_norm": 0.7919459342956543,
"learning_rate": 0.00015376310974029873,
"loss": 0.4075,
"step": 430
},
{
"epoch": 1.981981981981982,
"grad_norm": 0.5977569818496704,
"learning_rate": 0.0001510853845621409,
"loss": 0.504,
"step": 440
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.601466178894043,
"learning_rate": 0.00014835714330369446,
"loss": 0.3732,
"step": 450
},
{
"epoch": 2.0720720720720722,
"grad_norm": 0.575406014919281,
"learning_rate": 0.00014558108380223012,
"loss": 0.3317,
"step": 460
},
{
"epoch": 2.1171171171171173,
"grad_norm": 1.0440267324447632,
"learning_rate": 0.00014275995118036693,
"loss": 0.3896,
"step": 470
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.6646713614463806,
"learning_rate": 0.00013989653513154165,
"loss": 0.3478,
"step": 480
},
{
"epoch": 2.2072072072072073,
"grad_norm": 0.7202288508415222,
"learning_rate": 0.00013699366716140435,
"loss": 0.3712,
"step": 490
},
{
"epoch": 2.2522522522522523,
"grad_norm": 0.6566169261932373,
"learning_rate": 0.00013405421778786737,
"loss": 0.3548,
"step": 500
},
{
"epoch": 2.2972972972972974,
"grad_norm": 1.0158584117889404,
"learning_rate": 0.00013108109370257712,
"loss": 0.3404,
"step": 510
},
{
"epoch": 2.3423423423423424,
"grad_norm": 0.7582752108573914,
"learning_rate": 0.00012807723489661495,
"loss": 0.374,
"step": 520
},
{
"epoch": 2.3873873873873874,
"grad_norm": 0.7685467600822449,
"learning_rate": 0.00012504561175326985,
"loss": 0.3127,
"step": 530
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.9077286124229431,
"learning_rate": 0.00012198922211075778,
"loss": 0.353,
"step": 540
},
{
"epoch": 2.4774774774774775,
"grad_norm": 0.9107437133789062,
"learning_rate": 0.00011891108829779165,
"loss": 0.3531,
"step": 550
},
{
"epoch": 2.5225225225225225,
"grad_norm": 1.1385325193405151,
"learning_rate": 0.0001158142541449341,
"loss": 0.3695,
"step": 560
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.9225629568099976,
"learning_rate": 0.00011270178197468789,
"loss": 0.3606,
"step": 570
},
{
"epoch": 2.6126126126126126,
"grad_norm": 0.6338076591491699,
"learning_rate": 0.00010957674957330042,
"loss": 0.312,
"step": 580
},
{
"epoch": 2.6576576576576576,
"grad_norm": 1.3998136520385742,
"learning_rate": 0.00010644224714727681,
"loss": 0.4027,
"step": 590
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.598822832107544,
"learning_rate": 0.00010330137426761135,
"loss": 0.3496,
"step": 600
},
{
"epoch": 2.7477477477477477,
"grad_norm": 0.9068642854690552,
"learning_rate": 0.00010015723680475846,
"loss": 0.3489,
"step": 610
},
{
"epoch": 2.7927927927927927,
"grad_norm": 0.4025176167488098,
"learning_rate": 9.70129438573747e-05,
"loss": 0.3296,
"step": 620
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.6708613634109497,
"learning_rate": 9.38716046778684e-05,
"loss": 0.3004,
"step": 630
},
{
"epoch": 2.8828828828828827,
"grad_norm": 0.7858556509017944,
"learning_rate": 9.07363255977973e-05,
"loss": 0.3716,
"step": 640
},
{
"epoch": 2.9279279279279278,
"grad_norm": 0.6855165958404541,
"learning_rate": 8.76102069561545e-05,
"loss": 0.311,
"step": 650
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.6526620388031006,
"learning_rate": 8.449634003358022e-05,
"loss": 0.3488,
"step": 660
},
{
"epoch": 3.018018018018018,
"grad_norm": 0.3698066174983978,
"learning_rate": 8.13978039955308e-05,
"loss": 0.2858,
"step": 670
},
{
"epoch": 3.063063063063063,
"grad_norm": 0.8586738705635071,
"learning_rate": 7.831766284742807e-05,
"loss": 0.2565,
"step": 680
},
{
"epoch": 3.108108108108108,
"grad_norm": 0.8718597292900085,
"learning_rate": 7.525896240479976e-05,
"loss": 0.2173,
"step": 690
},
{
"epoch": 3.153153153153153,
"grad_norm": 0.7671772241592407,
"learning_rate": 7.222472728140695e-05,
"loss": 0.2548,
"step": 700
},
{
"epoch": 3.1981981981981984,
"grad_norm": 1.2702572345733643,
"learning_rate": 6.921795789833723e-05,
"loss": 0.2638,
"step": 710
},
{
"epoch": 3.2432432432432434,
"grad_norm": 1.4898873567581177,
"learning_rate": 6.624162751702076e-05,
"loss": 0.2623,
"step": 720
},
{
"epoch": 3.2882882882882885,
"grad_norm": 1.0137726068496704,
"learning_rate": 6.329867929910347e-05,
"loss": 0.2938,
"step": 730
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.9631416201591492,
"learning_rate": 6.039202339608432e-05,
"loss": 0.2443,
"step": 740
},
{
"epoch": 3.3783783783783785,
"grad_norm": 0.8912140130996704,
"learning_rate": 5.752453407159522e-05,
"loss": 0.2359,
"step": 750
},
{
"epoch": 3.4234234234234235,
"grad_norm": 0.9686083793640137,
"learning_rate": 5.469904685916861e-05,
"loss": 0.2465,
"step": 760
},
{
"epoch": 3.4684684684684686,
"grad_norm": 1.942658543586731,
"learning_rate": 5.191835575830352e-05,
"loss": 0.3042,
"step": 770
},
{
"epoch": 3.5135135135135136,
"grad_norm": 1.2755067348480225,
"learning_rate": 4.918521047160308e-05,
"loss": 0.2885,
"step": 780
},
{
"epoch": 3.5585585585585586,
"grad_norm": 0.8992679715156555,
"learning_rate": 4.650231368571486e-05,
"loss": 0.2728,
"step": 790
},
{
"epoch": 3.6036036036036037,
"grad_norm": 2.1154401302337646,
"learning_rate": 4.387231839876349e-05,
"loss": 0.258,
"step": 800
},
{
"epoch": 3.6486486486486487,
"grad_norm": 0.6957826018333435,
"learning_rate": 4.129782529691815e-05,
"loss": 0.3219,
"step": 810
},
{
"epoch": 3.6936936936936937,
"grad_norm": 0.8093072175979614,
"learning_rate": 3.878138018268866e-05,
"loss": 0.2318,
"step": 820
},
{
"epoch": 3.7387387387387387,
"grad_norm": 0.900164008140564,
"learning_rate": 3.632547145749395e-05,
"loss": 0.3025,
"step": 830
},
{
"epoch": 3.7837837837837838,
"grad_norm": 1.3732051849365234,
"learning_rate": 3.393252766099187e-05,
"loss": 0.2744,
"step": 840
},
{
"epoch": 3.828828828828829,
"grad_norm": 1.3438997268676758,
"learning_rate": 3.1604915069603436e-05,
"loss": 0.2663,
"step": 850
},
{
"epoch": 3.873873873873874,
"grad_norm": 0.7277224063873291,
"learning_rate": 2.9344935356606773e-05,
"loss": 0.2058,
"step": 860
},
{
"epoch": 3.918918918918919,
"grad_norm": 0.9671199321746826,
"learning_rate": 2.7154823316113932e-05,
"loss": 0.2466,
"step": 870
},
{
"epoch": 3.963963963963964,
"grad_norm": 1.0856068134307861,
"learning_rate": 2.5036744653181753e-05,
"loss": 0.2695,
"step": 880
},
{
"epoch": 4.009009009009009,
"grad_norm": 0.7191686034202576,
"learning_rate": 2.29927938422419e-05,
"loss": 0.2252,
"step": 890
},
{
"epoch": 4.054054054054054,
"grad_norm": 0.9094095826148987,
"learning_rate": 2.102499205596743e-05,
"loss": 0.2067,
"step": 900
},
{
"epoch": 4.099099099099099,
"grad_norm": 1.2016669511795044,
"learning_rate": 1.913528516662452e-05,
"loss": 0.2165,
"step": 910
},
{
"epoch": 4.1441441441441444,
"grad_norm": 1.6922552585601807,
"learning_rate": 1.7325541821885384e-05,
"loss": 0.2102,
"step": 920
},
{
"epoch": 4.1891891891891895,
"grad_norm": 1.52359139919281,
"learning_rate": 1.5597551597004966e-05,
"loss": 0.1765,
"step": 930
},
{
"epoch": 4.2342342342342345,
"grad_norm": 1.333765983581543,
"learning_rate": 1.3953023225189243e-05,
"loss": 0.2147,
"step": 940
},
{
"epoch": 4.2792792792792795,
"grad_norm": 0.9832772016525269,
"learning_rate": 1.23935829079042e-05,
"loss": 0.2068,
"step": 950
},
{
"epoch": 4.324324324324325,
"grad_norm": 0.7258216738700867,
"learning_rate": 1.0920772706797167e-05,
"loss": 0.1884,
"step": 960
},
{
"epoch": 4.36936936936937,
"grad_norm": 1.0229756832122803,
"learning_rate": 9.536049018820192e-06,
"loss": 0.2135,
"step": 970
},
{
"epoch": 4.414414414414415,
"grad_norm": 1.0085179805755615,
"learning_rate": 8.240781136063346e-06,
"loss": 0.1831,
"step": 980
},
{
"epoch": 4.45945945945946,
"grad_norm": 0.7446288466453552,
"learning_rate": 7.03624989172228e-06,
"loss": 0.198,
"step": 990
},
{
"epoch": 4.504504504504505,
"grad_norm": 0.8291650414466858,
"learning_rate": 5.9236463935389065e-06,
"loss": 0.2189,
"step": 1000
},
{
"epoch": 4.504504504504505,
"eval_loss": 0.9809222221374512,
"eval_runtime": 10.6739,
"eval_samples_per_second": 35.039,
"eval_steps_per_second": 4.403,
"step": 1000
},
{
"epoch": 4.54954954954955,
"grad_norm": 1.1298563480377197,
"learning_rate": 4.904070845967468e-06,
"loss": 0.1889,
"step": 1010
},
{
"epoch": 4.594594594594595,
"grad_norm": 1.0232703685760498,
"learning_rate": 3.9785314622310495e-06,
"loss": 0.1891,
"step": 1020
},
{
"epoch": 4.63963963963964,
"grad_norm": 1.2712104320526123,
"learning_rate": 3.1479434673440167e-06,
"loss": 0.1879,
"step": 1030
},
{
"epoch": 4.684684684684685,
"grad_norm": 1.564489722251892,
"learning_rate": 2.4131281930864002e-06,
"loss": 0.1972,
"step": 1040
},
{
"epoch": 4.72972972972973,
"grad_norm": 1.4100459814071655,
"learning_rate": 1.7748122658251876e-06,
"loss": 0.201,
"step": 1050
},
{
"epoch": 4.774774774774775,
"grad_norm": 1.3149417638778687,
"learning_rate": 1.2336268879856727e-06,
"loss": 0.1876,
"step": 1060
},
{
"epoch": 4.81981981981982,
"grad_norm": 0.8505904674530029,
"learning_rate": 7.901072138831511e-07,
"loss": 0.1722,
"step": 1070
},
{
"epoch": 4.864864864864865,
"grad_norm": 2.1957037448883057,
"learning_rate": 4.44691820532539e-07,
"loss": 0.1917,
"step": 1080
},
{
"epoch": 4.90990990990991,
"grad_norm": 1.9867583513259888,
"learning_rate": 1.977222739588891e-07,
"loss": 0.2082,
"step": 1090
},
{
"epoch": 4.954954954954955,
"grad_norm": 1.539480447769165,
"learning_rate": 4.9442791437848136e-08,
"loss": 0.2052,
"step": 1100
},
{
"epoch": 5.0,
"grad_norm": 0.6789027452468872,
"learning_rate": 0.0,
"loss": 0.197,
"step": 1110
},
{
"epoch": 5.0,
"step": 1110,
"total_flos": 1.01086802968209e+18,
"train_loss": 0.36593411194311604,
"train_runtime": 2755.6936,
"train_samples_per_second": 12.872,
"train_steps_per_second": 0.403
}
],
"logging_steps": 10,
"max_steps": 1110,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.01086802968209e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}